# [Setup]
Block 1 (light preprocessing)
Block 2 (gazetteer + spacy/stanza)
Block 3 (NER functions)
Block 7 (WordNet vague terms)
Block 8 (motion/transport terms)
Block 9 (sentence scoring prep)

# [Pipeline]
Block 1 (continue with PDF preprocessing)
Block 4 (load cleaned sentences)
Block 5 (NER + ML filtering)
Block 6 (boosting small towns)
Block 10 (GeoNames + Wikidata)
Block 11 (Wikidata enrichment)
Block 12 (clustering)


In [None]:
# Block 1: Imports and basic NLP setup
import re
import time
import requests
import unicodedata
import contractions
import nltk
import spacy
import stanza
# Fuzzy matching library
import pandas as pd
from pathlib import Path
from nltk.corpus import stopwords, wordnet as wn
from nltk.corpus import framenet as fn

# Downloads 
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("framenet_v17")

# Load models
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# in Block 1 (setup), replace sentencizer init:
# Setup for sentence segmentation
import spacy
sentencizer = spacy.blank("en")
from spacy.pipeline import Sentencizer
sentencizer.add_pipe(Sentencizer())


nlp_spacy = spacy.load("en_core_web_sm")
stanza_pipeline = stanza.Pipeline(lang="en", processors="tokenize,ner", use_gpu=True)

# Stopwords
nltk_stops = set(stopwords.words("english"))
spacy_stops = nlp.Defaults.stop_words
all_stops = nltk_stops.union(spacy_stops)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alicja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/alicja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package framenet_v17 to
[nltk_data]     /Users/alicja/nltk_data...
[nltk_data]   Package framenet_v17 is already up-to-date!
2025-07-17 20:25:04 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-07-17 20:25:05 INFO: Downloaded file to /Users/alicja/stanza_resources/resources.json
2025-07-17 20:25:05 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2025-07-17 20:25:05 INFO: Using device: cpu
2025-07-17 20:25:05 INFO: Loading: tokenize
2025-07-17 20:25:06 INFO: Loading: mwt
2025-07-17 20:25:06 INFO: Loading: ner
2025-07-17 20:25:10 INFO: Done loading processors!


In [None]:
# Block 2: Extract text from PDF using PyMuPDF
import fitz  # PyMuPDF

pdf_path = "/Users/alicja/Desktop/BA-code/Corpora/MotorcycleDiaries.pdf"
doc = fitz.open(pdf_path)

# Adjust page range as needed
pages_to_read = doc[29:148]
raw_text = "\n".join(page.get_text() for page in pages_to_read)

print(" PDF text extracted")


✅ PDF text extracted


In [3]:
# Block 3: Light cleaning (preserve structure)
def normalize_punctuation(text: str) -> str:
    return (
        text.replace("“", '"').replace("”", '"')
            .replace("’", "'").replace("‘", "'")
            .replace("—", "-").replace("–", "-")
    )

def clean_light(text: str) -> str:
    text = unicodedata.normalize("NFKC", text)
    text = contractions.fix(text)
    text = normalize_punctuation(text)
    
    # Remove URLs, emails, HTML tags, phone numbers
    patterns = [
        r'https?://\S+', r'\S+@\S+', r'<.*?>', r'\+?\d[\d\-\(\)\s]{5,}\d'
    ]
    for pat in patterns:
        text = re.sub(pat, " ", text)
        
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [4]:
# Block 4: Sentence segmentation using spaCy sentencizer
def segment_sentences(text: str) -> list:
    doc = sentencizer(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    return sentences


In [None]:
# Block 5:  NLP cleaning
def clean_heavy(text: str) -> str:
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    doc = list(nlp.pipe([text], batch_size=1000, n_process=1))[0]
    tokens = [
        tok.lemma_ for tok in doc
        if tok.lemma_.isalpha()
        and tok.lemma_ not in all_stops
        and tok.lemma_ != "-PRON-"
    ]
    return " ".join(tokens)


In [None]:
# ✅ Block 6: Build gazetteer using GeoNames API 
import requests
import time

geonames_username = "alicjab"  

# ISO codes for all South American countries
countries = [
    "AR", "BO", "BR", "CL", "CO", "EC", "GY", "PY", "PE", "SR", "UY", "VE",  # South America
    "MX", "GT", "HN", "SV", "NI", "CR", "PA"  # Central America / Mesoamerica
]


gazetteer = set()
max_rows = 1000  # max per request (GeoNames limit)

print("🌍 Downloading cities from GeoNames for all of South America...")

for country_code in countries:
    loaded = 0
    try:
        for start_row in range(0, 5000, max_rows):  # Optional paging: up to 5,000 per country
            url = "http://api.geonames.org/searchJSON"
            params = {
                "featureClass": "P",      # populated places
                "country": country_code,
                "maxRows": max_rows,
                "startRow": start_row,
                "orderby": "population",  # most important first
                "username": geonames_username
            }
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            cities = data.get("geonames", [])
            if not cities:
                break
            city_names = [
                entry["name"].lower() for entry in cities if "name" in entry
            ]
            gazetteer.update(city_names)
            loaded += len(city_names)
            time.sleep(1)  # Respect GeoNames rate limits
        print(f"Loaded {loaded} cities from {country_code}")
    except Exception as e:
        print(f"Error downloading cities for {country_code}: {e}")

print(f"📌 Total unique cities in gazetteer: {len(gazetteer)}")


🌍 Downloading cities from GeoNames for all of South America...
✅ Loaded 5000 cities from AR
✅ Loaded 5000 cities from BO
✅ Loaded 5000 cities from BR
✅ Loaded 5000 cities from CL
✅ Loaded 5000 cities from CO
✅ Loaded 5000 cities from EC
✅ Loaded 906 cities from GY
✅ Loaded 5000 cities from PY
✅ Loaded 5000 cities from PE
✅ Loaded 548 cities from SR
✅ Loaded 1077 cities from UY
✅ Loaded 5000 cities from VE
✅ Loaded 5000 cities from MX
✅ Loaded 5000 cities from GT
✅ Loaded 5000 cities from HN
✅ Loaded 4824 cities from SV
✅ Loaded 3039 cities from NI
✅ Loaded 5000 cities from CR
✅ Loaded 5000 cities from PA
📌 Total unique cities in gazetteer: 48784


In [None]:
# Block 7: Apply cleaning and segmentation to raw text
light_cleaned = clean_light(raw_text)
sentences = segment_sentences(light_cleaned)

# Optional tagging for geoparsing
sentences_with_tags = [f"[SENT {i+1}] {s}" for i, s in enumerate(sentences)]

# Heavy-cleaned version for downstream NLP
heavy_cleaned = clean_heavy(light_cleaned)

# Save outputs
Path("outputs").mkdir(exist_ok=True)

with open("outputs/cleaned_motorcycle_diaries_geoparsing.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sentences_with_tags))

with open("outputs/cleaned_motorcycle_diaries_nlp.txt", "w", encoding="utf-8") as f:
    f.write(heavy_cleaned)

print("Cleaned versions saved")


✅ Cleaned versions saved


In [None]:
# Block 8: Load sentence list with IDs from saved file
with open("outputs/cleaned_motorcycle_diaries_geoparsing.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

sentence_data = []
for line in lines:
    match = re.match(r"\[SENT (\d+)\] (.+)", line.strip())
    if match:
        sent_id = int(match.group(1))
        sent_text = match.group(2)
        sentence_data.append((sent_id, sent_text))

print(f" Loaded {len(sentence_data)} tagged sentences")


✅ Loaded 2476 tagged sentences


In [None]:
# Block 9: NER + Gazetteer with optional Stanza and progress bar (with short-name filtering)
from tqdm import tqdm
from rapidfuzz import fuzz
import re

# Toggle Stanza use (set to False to avoid slowdowns)
USE_STANZA = False

def extract_entities_spacy(text):
    doc = nlp_spacy(text)
    return [(ent.text, ent.label_, ent.start_char, ent.end_char)
            for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]

def extract_entities_stanza(text):
    doc = stanza_pipeline(text)
    results = []
    for sent in doc.sentences:
        for ent in sent.ents:
            if ent.type in {"GPE", "LOC"}:
                results.append((ent.text, ent.type, ent.start_char, ent.end_char))
    return results

def match_gazetteer(text, known_places):
    """
    gazetteer matching using word-boundary regex to reduce false positives.
    Matches longer place names first to avoid substring collisions.
    filters out short entries (≤3 chars) to reduce noise.
    """
    text_lower = text.lower()
    matches = []

    # Ignore very short place names (common source of noise)
    known_places_sorted = sorted([p for p in known_places if len(p) > 3], key=len, reverse=True)

    for place in known_places_sorted:
        pattern = r'\b{}\b'.format(re.escape(place.lower()))
        for m in re.finditer(pattern, text_lower):
            start, end = m.start(), m.end()
            match_text = text[start:end]
            matches.append((match_text, "GAZETTEER", start, end))
    return matches

def combine_ner_gazetteer(sentences, gazetteer):
    results = []
    for sent_id, text in tqdm(sentences, desc="NER + Gazetteer"):
        try:
            ents_spacy = extract_entities_spacy(text)
            ents_stanza = extract_entities_stanza(text) if USE_STANZA else []
            ents_gazetteer = match_gazetteer(text, gazetteer)

            all_ents = ents_spacy + ents_stanza + ents_gazetteer
            seen = set()
            for ent_text, label, start, end in all_ents:
                norm = ent_text.lower()
                if not any(fuzz.ratio(norm, s) > 90 for s in seen):
                    seen.add(norm)
                    results.append({
                        "sentence_id": sent_id,
                        "entity": ent_text,
                        "entity_norm": norm,
                        "label": label,
                        "start_char": start,
                        "end_char": end,
                        "sentence": text
                    })
        except Exception as e:
            print(f" Error in sentence {sent_id}: {e}")
    return results


print("🔎 Running ensemble NER + gazetteer matching...")
entities_combined = combine_ner_gazetteer(sentence_data, gazetteer)

df_combined = pd.DataFrame(entities_combined)
df_combined.to_csv("outputs/geoparsing_ner_ensemble.csv", index=False)
print(" Saved: outputs/geoparsing_ner_ensemble.csv")


🔎 Running ensemble NER + gazetteer matching...


NER + Gazetteer: 100%|██████████| 2476/2476 [41:27<00:00,  1.00s/it]

✅ Saved: outputs/geoparsing_ner_ensemble.csv





In [None]:
# Block 10: Train ML model to filter real geographic entities
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Load ensemble NER output
df = pd.read_csv("outputs/geoparsing_ner_ensemble.csv")

# Create features
df["label_GPE"] = (df["label"] == "GPE").astype(int)
df["label_LOC"] = (df["label"] == "LOC").astype(int)

symbolic_keywords = ["freedom", "struggle", "liberation", "future", "dream", "cause", "revolution", "hope", "people"]
df["symbolic_flagged"] = df["sentence"].str.contains("|".join(symbolic_keywords), flags=re.IGNORECASE, na=False)

expected_countries = ["argentina", "chile", "peru", "bolivia", "colombia", "venezuela"]
df["country_valid"] = df["sentence"].str.lower().apply(
    lambda x: int(any(country in x for country in expected_countries))
)

df["fuzzy_score"] = df.apply(
    lambda row: fuzz.ratio(str(row["entity"]).lower(), str(row["entity_norm"]).lower()), axis=1
)
df["fuzzy_score_scaled"] = df["fuzzy_score"] / 100.0

df["auto_label"] = ((df["country_valid"] == 1) & (~df["symbolic_flagged"])).astype(int)

# Train/test split
features = df[["label_GPE", "label_LOC", "symbolic_flagged", "country_valid", "fuzzy_score_scaled"]]
target = df["auto_label"]

X_train, X_test, y_train, y_test = train_test_split(features, target, stratify=target, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
print(" Classification Report:")
print(classification_report(y_test, y_pred))

# Predict on all
X_all_scaled = scaler.transform(features)
df["geo_confidence"] = clf.predict_proba(X_all_scaled)[:, 1]
df["filtered_out_ml"] = df["geo_confidence"] < 0.5

# Save outputs
df_filtered = df[~df["filtered_out_ml"]].copy()
df.to_csv("outputs/geoparsing_ensemble_flagged_with_ml.csv", index=False)
df_filtered.to_csv("outputs/geoparsing_ensemble_final_ml_filtered.csv", index=False)

print(" ML filtering complete")


📊 Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       233
           1       1.00      1.00      1.00        37

    accuracy                           1.00       270
   macro avg       1.00      1.00      1.00       270
weighted avg       1.00      1.00      1.00       270

✅ ML filtering complete


In [None]:
# Block 11: Boost confidence for small towns in South America
df = pd.read_csv("outputs/geoparsing_ensemble_final_ml_filtered.csv")

# Define target countries
target_countries = [
    "argentina", "chile", "peru", "bolivia", "colombia",
    "venezuela", "ecuador", "brazil", "uruguay", "paraguay"
]

# Prepare for matching
df["entity_norm_lower"] = df["entity_norm"].str.lower()

# Try to load population-enriched data
try:
    enriched = pd.read_csv("outputs/geoparsing_final_enriched.csv")
    enriched["entity_norm_lower"] = enriched["entity_norm"].str.lower()

    df = df.merge(
        enriched[["entity_norm_lower", "country", "population"]],
        on="entity_norm_lower", how="left"
    )

    # Mark small towns
    df["boost_small_town"] = (
        df["population"].fillna(0).lt(50000) &
        df["country"].str.lower().isin(target_countries)
    )
    print(f" Boosted {df['boost_small_town'].sum()} small towns")

except Exception as e:
    print(f" Could not load enriched data: {e}")
    df["boost_small_town"] = False

# Apply confidence boost
df["geo_confidence_boosted"] = df["geo_confidence"]
df.loc[df["boost_small_town"], "geo_confidence_boosted"] = 0.9

# Save result
df.to_csv("outputs/geoparsing_ensemble_final_ml_boosted.csv", index=False)
print("Saved: geoparsing_ensemble_final_ml_boosted.csv")


⚠️ Could not load enriched data: [Errno 2] No such file or directory: 'outputs/geoparsing_final_enriched.csv'
✅ Saved: geoparsing_ensemble_final_ml_boosted.csv


In [None]:

# Block 12: vague term filtering (ML + WordNet) + South America region restriction

from nltk.corpus import wordnet as wn
import pandas as pd

df = pd.read_csv("outputs/geoparsing_ensemble_final_ml_boosted.csv")

#  Normalize entity name 
df["entity_norm"] = df["entity_norm"].astype(str).str.lower().str.strip()

# === STEP 1: ML-learned vague terms ===
term_stats = df.groupby("entity_norm").agg({
    "geo_confidence_boosted": "mean",
    "entity": "count"
}).rename(columns={
    "entity": "freq",
    "geo_confidence_boosted": "avg_conf"
}).reset_index()

learned_vague = term_stats[
    (term_stats["freq"] >= 3) &
    (term_stats["avg_conf"] < 0.35)
]["entity_norm"].tolist()
print(f" ML-learned vague terms: {len(learned_vague)}")

# === STEP 2: WordNet vague terms ===
location_synsets = [
    wn.synset("location.n.01"),
    wn.synset("region.n.01"),
    wn.synset("area.n.01"),
    wn.synset("place.n.01"),
    wn.synset("territory.n.01")
]

vague_terms_wordnet = set()
for syn in location_synsets:
    for hypo in syn.closure(lambda s: s.hyponyms()):
        for lemma in hypo.lemmas():
            vague_terms_wordnet.add(lemma.name().lower().replace("_", " "))
print(f" WordNet vague terms: {len(vague_terms_wordnet)}")

# === STEP 3: Combined vague term filtering ===
df["is_vague_combined"] = df.apply(
    lambda row: (
        row["entity_norm"] in learned_vague or
        (row["label"] == "LOC" and row["entity_norm"] in vague_terms_wordnet)
    ),
    axis=1
)

df_filtered = df[~df["is_vague_combined"]].copy()
print(f" Removed {df['is_vague_combined'].sum()} entities (ML or WordNet flagged)")

# === STEP 4: South America region restriction ===
sa_countries = {
    "argentina", "bolivia", "brazil", "chile", "colombia",
    "ecuador", "guyana", "paraguay", "peru", "suriname", "uruguay", "venezuela"
}

if "country" in df_filtered.columns:
    df_filtered["country"] = df_filtered["country"].astype(str).str.lower()
    df_filtered = df_filtered[df_filtered["country"].isin(sa_countries)].copy()
    print(f"🌎 After SA region filter: {len(df_filtered)} rows")
else:
    print(" 'country' column not found. Cannot apply region filter.")

# === Final Save ===
df_filtered.to_csv("outputs/geoparsing_ner_ensemble_filtered_southamerica.csv", index=False)
print(" Saved: geoparsing_ner_ensemble_filtered_southamerica.csv")



📉 ML-learned vague terms: 0
📚 WordNet vague terms: 1786
❌ Removed 0 entities (ML or WordNet flagged)
⚠️ 'country' column not found. Cannot apply region filter.
✅ Saved: geoparsing_ner_ensemble_filtered_southamerica.csv


In [None]:
# Block 13 : Contextual Scoring with enhanced inputs
import spacy
import re
import dateparser.search
from tqdm import tqdm
from nltk.corpus import wordnet as wn, framenet as fn

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# === Load motion verbs ===
motion_frames = ['Motion', 'Travel', 'Self_motion', 'Arriving', 'Departing']
motion_verbs = set()
for frame in motion_frames:
    try:
        for lu in fn.frame_by_name(frame).lexUnit.values():
            if lu['name'].endswith('.v'):
                motion_verbs.add(lu['name'].split('.')[0].lower())
    except:
        continue

# === Load transport terms ===
vehicle_syn = wn.synset('vehicle.n.01')
transport_terms = set()
for syn in vehicle_syn.closure(lambda s: s.hyponyms()):
    for lemma in syn.lemmas():
        transport_terms.add(lemma.name().lower().replace('_', ' '))

# === Load filtered file after SA restriction ===
df = pd.read_csv("outputs/geoparsing_ner_ensemble_filtered_southamerica.csv")

# === Load sentence map ===
with open("outputs/cleaned_motorcycle_diaries_geoparsing.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
sentence_map = {}
for line in lines:
    if line.strip().startswith("[SENT"):
        sid = int(line.split("]")[0].split()[1])
        sentence_map[sid] = line.split("]")[1].strip()

# === Scoring logic ===
scored_rows = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Scoring entities"):
    sid = row["sentence_id"]
    entity = row["entity"]
    norm = row["entity_norm"]
    label = row["label"]
    sentence = sentence_map.get(sid, "")

    doc = nlp(sentence)
    score = 0
    entity_token = None

    for ent in doc.ents:
        if ent.label_ in {"GPE", "LOC"} and ent.text.lower().strip() == entity.lower().strip():
            entity_token = ent.root
            break

    if not entity_token:
        continue

    sentence_lower = sentence.lower()
    entity_lower = entity.lower()

    if any(verb in sentence_lower for verb in motion_verbs):
        score += 1
    if any(term in sentence_lower for term in transport_terms):
        score += 1
    if re.search(r'\b(in|to|at)\s+' + re.escape(entity_lower) + r'\b', sentence_lower):
        score += 1
    if re.search(r"\b" + re.escape(entity_lower) + r"['’]s\b", sentence_lower):
        score -= 1
    if re.search(r"\bof\s+" + re.escape(entity_lower) + r"\b", sentence_lower):
        score -= 1
    if re.search(r"\bfor\s+" + re.escape(entity_lower) + r"\b", sentence_lower):
        score -= 1
    if dateparser.search.search_dates(sentence):
        score += 1

    scored_rows.append({
        "sentence_id": sid,
        "entity": entity,
        "entity_norm": norm,
        "label": label,
        "sentence": sentence,
        "score": score,
        "latitude": row.get("latitude"),
        "longitude": row.get("longitude"),
        "country": row.get("country")
    })

scored_df = pd.DataFrame(scored_rows)
scored_df = scored_df.sort_values(by=["score", "sentence_id"], ascending=[False, True])
scored_df.to_csv("outputs/geoparsing_scored_candidates.csv", index=False)
print("📄 Saved: outputs/geoparsing_scored_candidates.csv")




  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
  for synset in acyclic_breadth_first(self, rel, depth):
Scoring entities: 100%|██████████| 184/184 [00:15<00:00, 11.63it/s]

📄 Saved: outputs/geoparsing_scored_candidates.csv





In [None]:
# Block 14: Enrich entities with coordinates via GeoNames → Wikidata → OSM
import time
import requests
import pandas as pd
from tqdm import tqdm

# === Load scored candidates ===
df = pd.read_csv("outputs/geoparsing_scored_candidates.csv")
places = df["entity_norm"].dropna().unique()
geonames_username = "alicjab"

geo_data = {}

# === Step 1: GeoNames ===
print("🌍 Querying GeoNames...")
for place in tqdm(places, desc="GeoNames queries"):
    try:
        params = {"q": place, "maxRows": 1, "username": geonames_username}
        r = requests.get("http://api.geonames.org/searchJSON", params=params, timeout=10).json()
        if not r.get("geonames"):
            raise ValueError("No result")
        g = r["geonames"][0]
        geo_data[place] = {
            "latitude": float(g["lat"]),
            "longitude": float(g["lng"]),
            "country": g.get("countryName"),
            "population": int(g.get("population", 0))
        }
    except Exception:
        geo_data[place] = {"latitude": None, "longitude": None, "country": None, "population": None}
    time.sleep(1)  # GeoNames rate limit

geo_df = pd.DataFrame.from_dict(geo_data, orient="index")
geo_df.index.name = "entity_norm"
geo_df.reset_index(inplace=True)

# === Merge GeoNames results ===
df_enriched = df.merge(geo_df, on="entity_norm", how="left")

# === Step 2: Wikidata fallback ===
print("🔁 Querying Wikidata for missing coordinates...")
if "latitude" not in df_enriched.columns:
    df_enriched["latitude"] = None
    df_enriched["longitude"] = None

missing_places = df_enriched[df_enriched["latitude"].isna()]["entity_norm"].dropna().unique()

def query_wikidata_coords(place):
    try:
        search_url = "https://www.wikidata.org/w/api.php"
        search_params = {
            "action": "wbsearchentities",
            "search": place,
            "language": "en",
            "format": "json"
        }
        r = requests.get(search_url, params=search_params, timeout=10).json()
        if not r["search"]:
            return {"entity_norm": place, "wikidata_lat": None, "wikidata_lon": None}
        qid = r["search"][0]["id"]
        entity = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json").json()
        coords = entity["entities"][qid]["claims"].get("P625", [{}])[0].get("mainsnak", {}).get("datavalue", {}).get("value", {})
        return {"entity_norm": place, "wikidata_lat": coords.get("latitude"), "wikidata_lon": coords.get("longitude")}
    except Exception:
        return {"entity_norm": place, "wikidata_lat": None, "wikidata_lon": None}

wikidata_results = pd.DataFrame([query_wikidata_coords(p) for p in tqdm(missing_places, desc="Wikidata queries")])
df_enriched = df_enriched.merge(wikidata_results, on="entity_norm", how="left")
df_enriched["latitude"] = df_enriched["latitude"].combine_first(df_enriched["wikidata_lat"])
df_enriched["longitude"] = df_enriched["longitude"].combine_first(df_enriched["wikidata_lon"])

# === Step 3: OSM fallback ===
print("🧭 Querying OpenStreetMap for unresolved locations...")
nominatim_places = df_enriched[df_enriched["latitude"].isna()]["entity_norm"].dropna().unique()

def query_osm(place):
    try:
        r = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={"q": place, "format": "json", "limit": 1},
            headers={"User-Agent": "Geoparser/1.0"},
            timeout=10
        ).json()
        if not r:
            return {"entity_norm": place, "osm_lat": None, "osm_lon": None}
        return {
            "entity_norm": place,
            "osm_lat": float(r[0]["lat"]),
            "osm_lon": float(r[0]["lon"])
        }
    except Exception:
        return {"entity_norm": place, "osm_lat": None, "osm_lon": None}

osm_results = pd.DataFrame([query_osm(p) for p in tqdm(nominatim_places, desc="OSM queries")])
df_enriched = df_enriched.merge(osm_results, on="entity_norm", how="left")
df_enriched["latitude"] = df_enriched["latitude"].combine_first(df_enriched["osm_lat"])
df_enriched["longitude"] = df_enriched["longitude"].combine_first(df_enriched["osm_lon"])


df_enriched.to_csv("outputs/geoparsing_final_enriched.csv", index=False)
print("✅ Final enriched file saved: geoparsing_final_enriched.csv")


🌍 Querying GeoNames...


GeoNames queries: 100%|██████████| 43/43 [04:28<00:00,  6.25s/it]


🔁 Querying Wikidata for missing coordinates...


Wikidata queries: 100%|██████████| 43/43 [00:06<00:00,  6.15it/s]
  df_enriched["latitude"] = df_enriched["latitude"].combine_first(df_enriched["wikidata_lat"])
  df_enriched["longitude"] = df_enriched["longitude"].combine_first(df_enriched["wikidata_lon"])


🧭 Querying OpenStreetMap for unresolved locations...


OSM queries: 100%|██████████| 42/42 [00:41<00:00,  1.02it/s]

✅ Final enriched file saved: geoparsing_final_enriched.csv





In [None]:
# Block 15: Geographic Outlier Filtering using DBSCAN (corrected input + centroid-aware reweighting + continent filtering)
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
from geopy.distance import geodesic
from tqdm import tqdm
import reverse_geocoder as rg


df = pd.read_csv("outputs/geoparsing_final_enriched.csv")

# Drop rows without coordinates
df_geo = df.dropna(subset=["latitude", "longitude"]).copy()
coords_rad = np.radians(df_geo[["latitude", "longitude"]].values)

# Cluster with DBSCAN using haversine distance (in radians)
clustering = DBSCAN(eps=0.5, min_samples=2, metric='haversine')
df_geo["geo_cluster"] = clustering.fit_predict(coords_rad)

# === Compute centroids of each valid cluster ===
centroids = (
    df_geo[df_geo["geo_cluster"] != -1]
    .groupby("geo_cluster")[["latitude", "longitude"]]
    .mean()
    .to_dict("index")
)

# === Reweight scores for outliers ===
outliers = df_geo[df_geo["geo_cluster"] == -1].copy()
non_outliers = df_geo[df_geo["geo_cluster"] != -1].copy()

reweighted = []
for _, row in tqdm(outliers.iterrows(), total=len(outliers), desc="↩️ Reweighting outliers"):
    min_dist_km = float("inf")
    entity_point = (row["latitude"], row["longitude"])

    for c in centroids.values():
        centroid_point = (c["latitude"], c["longitude"])
        dist = geodesic(entity_point, centroid_point).km
        if dist < min_dist_km:
            min_dist_km = dist

    # If outlier is <500 km from any cluster, keep it with downgraded score
    if min_dist_km < 500:
        row["geo_cluster"] = -2  # kept but marked as downgraded outlier
        row["geo_score_adjusted"] = row.get("score", 0) - 1
        reweighted.append(row)

# Combine cleaned + downgraded outliers
final_df = pd.concat([non_outliers, pd.DataFrame(reweighted)], ignore_index=True)
print(f" Final post-reweighting size: {len(final_df)} rows")

# === Filter by continent (keep only South America) ===
def get_continent(lat, lon):
    try:
        results = rg.search((lat, lon), mode=1)
        cc = results[0]['cc']
        # ISO country codes in South America
        south_america = {
            'AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'GY', 'PY', 'PE', 'SR', 'UY', 'VE'
        }
        return 'South America' if cc in south_america else 'Other'
    except:
        return 'Unknown'

print("🌎 Filtering by continent...")
final_df["continent"] = final_df.apply(
    lambda row: get_continent(row["latitude"], row["longitude"]), axis=1
)

before_filter = len(final_df)
final_df = final_df[final_df["continent"] == "South America"].copy()
print(f"🌍 Removed {before_filter - len(final_df)} non-South American entries")

# Save final output
final_df.to_csv("outputs/geoparsing_final_scored_clustered.csv", index=False)
print("Saved final cleaned + reweighted + region-filtered version: geoparsing_final_scored_clustered.csv")

↩️ Reweighting outliers: 100%|██████████| 2/2 [00:00<00:00, 1387.01it/s]


🧭 Final post-reweighting size: 108 rows
🌎 Filtering by continent...
Loading formatted geocoded file...
🌍 Removed 18 non-South American entries
✅ Saved final cleaned + reweighted + region-filtered version: geoparsing_final_scored_clustered.csv


In [None]:
#Block 16: Visualize clustered results using Plotly

import pandas as pd
import plotly.express as px

# Load clustered data
df = pd.read_csv("outputs/geoparsing_final_scored_clustered.csv")

# Drop NaNs (should already be clean)
df = df.dropna(subset=["latitude", "longitude"])

# Rename cluster column for consistency with Plotly
df["cluster"] = df["geo_cluster"]

# Create hover label
df["hover"] = df["entity"] + " (cluster " + df["cluster"].astype(str) + ")"

# Basic scatter geo map
fig = px.scatter_geo(
    df,
    lat="latitude",
    lon="longitude",
    text="entity",
    hover_name="hover",
    color="cluster",
    title="Clustered Location Mentions from Text",
    projection="natural earth"
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(legend_title_text='Cluster ID')
fig.show()

