In [1]:
# Cell 1: Setup and Load Data

# 1. Import all required Python libraries.
# 2. Set the project's root path and load the configuration.
# 3. Find and load the 'seed_enwiki_*.csv' file created by the first notebook.

import time
import json
import re
import requests
import pandas as pd
import sqlite3
import os
import itertools
from pathlib import Path
from tqdm.notebook import tqdm
from collections import Counter
import ast

# --- Project Configuration ---
# When the kernel was started from inside `notebooks/`, step up one level so
# every path below resolves against the project root.
ROOT = Path.cwd()
if ROOT.name == "notebooks":
    ROOT = ROOT.parent

# Read the config through a context manager so the file handle is closed
# deterministically, and decode as UTF-8 instead of the platform default
# (which is a legacy code page on Windows).
with open(ROOT / "conf" / "project.json", encoding="utf-8") as conf_file:
    CONF = json.load(conf_file)
print(f"✅ Project Root: {ROOT}")
print(f"✅ Config loaded for project: '{CONF['project']}'")

# --- Load Seed Data from Notebook 01 ---
# Find the most recent seed file in the 'data/raw' directory.
# The file names embed a timestamp, so the lexicographically last path is
# taken as the newest one.
try:
    seed_path = sorted((ROOT / "data" / "raw").glob("seed_enwiki_*.csv"))[-1]
    seed_df = pd.read_csv(seed_path)
    print(f"✅ Loaded seed file: {seed_path.name} | Rows: {len(seed_df):,}")
except IndexError:
    # `[-1]` on an empty list: no file matched the glob.
    print("❌ Error: No seed file found in 'data/raw/'. Please run notebook 01 first.")
    # Create an empty df to allow the notebook to load, but it will fail later
    seed_df = pd.DataFrame()

# --- Create Output Directories ---
TMP_ENRICHED_DIR = ROOT / "data" / "processed" / "tmp_enriched"
TMP_NORMALIZED_DIR = ROOT / "data" / "processed" / "tmp_normalized"
TMP_ENRICHED_DIR.mkdir(parents=True, exist_ok=True)
TMP_NORMALIZED_DIR.mkdir(parents=True, exist_ok=True)

print("✅ Setup complete. Ready to proceed.")

✅ Project Root: C:\Users\drrahman\wiki-gaps-project
✅ Config loaded for project: 'wiki-gaps'
✅ Loaded seed file: seed_enwiki_20251007-213232.csv | Rows: 1,125,607
✅ Setup complete. Ready to proceed.


In [3]:
# Cell 2: API Session and Cache Setup 

# This cell prepares the tools for data enrichment. 
# It sets up a robust session for making API requests and initializes a local SQLite database to cache all results, making the long-running process resumable.

from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# --- API Session Setup ---
def make_api_session(user_agent: str):
    """Build a `requests.Session` that identifies itself and retries failed calls.

    Args:
        user_agent: Value sent as the ``User-Agent`` header on every request.

    Returns:
        A session whose ``https://`` adapter retries up to 6 times (overall,
        and per connect/read/status category) on HTTP 429, 502, 503 and 504,
        using a backoff factor of 0.8 and honouring ``Retry-After`` headers.
        Only the ``https://`` prefix is mounted with this policy.
    """
    retry_policy = Retry(
        total=6,
        connect=6,
        read=6,
        status=6,
        status_forcelist=(429, 502, 503, 504),
        backoff_factor=0.8,
        respect_retry_after_header=True,
    )

    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return session

# Endpoint and client identity shared by every Wikidata request in this notebook.
WIKIDATA_API = "https://www.wikidata.org/w/api.php"
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
USER_AGENT = "WikiGaps/0.1 (contact: ashhik96@gmail.com)"
# One module-level session so the retry policy and connection pool are reused.
SESSION_WD = make_api_session(USER_AGENT)

print("✅ API session configured.")

# --- SQLite Cache Setup ---
CACHE_DB_PATH = ROOT / "data" / "cache" / "wd_cache.sqlite"
# sqlite3 creates the database file on demand but NOT its parent directory;
# without this, a fresh checkout fails with "unable to open database file".
CACHE_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(CACHE_DB_PATH)
cur = conn.cursor()

# Define the schema for storing entity data and labels.
#   entity_min: one row per entity QID; the *_qids columns hold pipe-separated
#               value QIDs as written by the enrichment step.
#   label:      one row per (QID, language) pair.
# `IF NOT EXISTS` keeps the cell safe to re-run against an existing cache.
cur.executescript("""
    PRAGMA journal_mode=WAL;
    PRAGMA synchronous=NORMAL;

    CREATE TABLE IF NOT EXISTS entity_min (
      qid TEXT PRIMARY KEY,
      title TEXT,
      gender_qids TEXT,
      country_qids TEXT,
      occupation_qids TEXT,
      pob_qids TEXT
    );

    CREATE TABLE IF NOT EXISTS label (
      qid TEXT NOT NULL,
      lang TEXT NOT NULL,
      label TEXT,
      PRIMARY KEY (qid, lang)
    );
""")
conn.commit()
print(f"✅ SQLite cache ready at: {CACHE_DB_PATH}")

✅ API session configured.
✅ SQLite cache ready at: C:\Users\drrahman\wiki-gaps-project\data\cache\wd_cache.sqlite


In [4]:
# Cell 3: Cache Helper Functions

# This cell defines the helper functions that the script will use to read from and write to the SQLite cache. 

def cache_get_entity_min(qids: list[str], batch_size: int = 900) -> dict:
    """Retrieves full entity records from the cache.

    The lookup is issued in batches because SQLite caps the number of bound
    parameters per statement (SQLITE_MAX_VARIABLE_NUMBER, 999 on older builds).
    A single ``IN (...)`` with one placeholder per QID fails with
    "too many SQL variables" once a chunk outgrows that cap.

    Args:
        qids: Entity QIDs to look up.
        batch_size: Maximum number of QIDs bound per SELECT (values below 1
            are treated as 1).

    Returns:
        ``{qid: record}`` for every QID found in ``entity_min``; QIDs that are
        not cached are simply absent. Each record has the keys ``qid``,
        ``title``, ``gender_qids``, ``country_qids``, ``occupation_qids`` and
        ``pob_qids``; NULL ``*_qids`` columns are returned as ``""`` while
        ``title`` is passed through unchanged. No ordering is guaranteed.
    """
    if not qids:
        return {}

    qids = list(qids)
    step = max(1, batch_size)

    records = {}
    for start in range(0, len(qids), step):
        batch = qids[start:start + step]
        placeholders = ",".join("?" * len(batch))
        cur.execute(
            f"""
            SELECT qid, title, gender_qids, country_qids, occupation_qids, pob_qids
            FROM entity_min WHERE qid IN ({placeholders})
            """,
            batch,
        )
        for qid, title, gender, country, occupation, pob in cur.fetchall():
            records[qid] = {
                "qid": qid,
                "title": title,
                "gender_qids": gender or "",
                "country_qids": country or "",
                "occupation_qids": occupation or "",
                "pob_qids": pob or "",
            }
    return records

def cache_put_entity_min(rows: list[dict]):
    """Inserts or replaces entity records in the cache.

    Args:
        rows: Dicts with the keys ``qid``, ``title``, ``gender_qids``,
            ``country_qids`` and ``occupation_qids``; ``pob_qids`` is optional
            and stored as ``""`` when the key is absent.

    The caller's dicts are not modified: the ``pob_qids`` default is filled in
    on shallow copies rather than via ``setdefault`` on the input. The write is
    committed on the module-level connection before returning.
    """
    if not rows:
        return

    # `{"pob_qids": "", **row}` keeps an explicit value (even None) if the key
    # is present and only supplies "" when it is missing.
    params = [{"pob_qids": "", **row} for row in rows]

    cur.executemany("""
        INSERT OR REPLACE INTO entity_min
        (qid, title, gender_qids, country_qids, occupation_qids, pob_qids)
        VALUES (:qid, :title, :gender_qids, :country_qids, :occupation_qids, :pob_qids)
    """, params)
    conn.commit()

def cache_get_labels(qids: list[str], lang="en", batch_size: int = 900) -> dict:
    """Retrieves labels for a list of QIDs.

    The lookup is issued in batches because SQLite caps the number of bound
    parameters per statement (SQLITE_MAX_VARIABLE_NUMBER, 999 on older builds).
    The normalization step passes tens of thousands of QIDs at once, which
    fails with "too many SQL variables" on builds with the stock limit.

    Args:
        qids: QIDs whose labels are wanted.
        lang: Language code matched against the ``lang`` column.
        batch_size: Maximum number of QIDs bound per SELECT (values below 1
            are treated as 1). One extra parameter is used for ``lang``.

    Returns:
        ``{qid: label}`` for every (qid, lang) row present in ``label``. The
        label value may be ``None`` when a NULL label was cached; QIDs with no
        row are absent from the result.
    """
    if not qids:
        return {}

    qids = list(qids)
    step = max(1, batch_size)

    labels = {}
    for start in range(0, len(qids), step):
        batch = qids[start:start + step]
        placeholders = ",".join("?" * len(batch))
        cur.execute(
            f"SELECT qid, label FROM label WHERE lang=? AND qid IN ({placeholders})",
            [lang, *batch],
        )
        labels.update(cur.fetchall())
    return labels

def cache_put_labels(mapping: dict, lang="en"):
    """Upsert ``{qid: label}`` pairs into the ``label`` table for one language.

    An empty mapping is a no-op (nothing is written or committed). Existing
    (qid, lang) rows are overwritten, and the write is committed on the
    module-level connection before returning.
    """
    if mapping:
        label_rows = []
        for entity_id, text in mapping.items():
            label_rows.append((entity_id, lang, text))
        cur.executemany(
            "INSERT OR REPLACE INTO label(qid, lang, label) VALUES (?,?,?)",
            label_rows,
        )
        conn.commit()

print("✅ Cache helper functions are ready.")

✅ Cache helper functions are ready.


In [5]:
# Cell 4: Wikidata API Functions

# This cell defines the functions that will communicate with the live Wikidata API.
# One function gets the enriched data (gender, country, etc.), and the other gets the human-readable labels for the Wikidata QIDs.

def wd_get_enriched_entities(qids: list[str], lang="en") -> tuple[list[dict], set]:
    """
    Fetches enriched data for up to 50 QIDs from the Wikidata API.

    Args:
        qids: Entity QIDs to fetch in one ``wbgetentities`` call. No batching
            is done here; the caller is expected to pass at most 50.
        lang: Language code; selects the ``{lang}wiki`` sitelink used as title.

    Returns a tuple containing:
      - A list of dicts with the structured data for each entity
        (``qid``, ``title``, ``gender_qids``, ``country_qids``,
        ``occupation_qids``, ``pob_qids``). The ``*_qids`` fields are
        pipe-separated, order-preserving and de-duplicated; ``title`` is
        ``None`` when the entity has no ``{lang}wiki`` sitelink.
      - A set of all unique "value" QIDs encountered (for fetching labels later).

    On a transport error, a non-JSON body, or an API-level ``error`` payload,
    the problem is printed and ``([], set())`` is returned, so the caller can
    tell that nothing was fetched for this batch.
    """
    if not qids: return [], set()

    site_id = f"{lang}wiki"
    params = {
        "action": "wbgetentities",
        "ids": "|".join(qids),
        "props": "claims|sitelinks",
        # Only the one sitelink read below is needed; filtering server-side
        # avoids downloading every wiki's sitelink for every entity.
        "sitefilter": site_id,
        "languages": lang,
        "format": "json"
    }

    try:
        r = SESSION_WD.get(WIKIDATA_API, params=params, timeout=90)
        r.raise_for_status()
        data = r.json()
    # ValueError covers a non-JSON body on `requests` versions whose
    # JSONDecodeError is not a RequestException subclass.
    except (requests.RequestException, ValueError) as e:
        print(f"❌ API Error: {e}")
        return [], set()

    # The API reports request-level failures (e.g. a malformed id) as HTTP 200
    # with an "error" object and no "entities"; surface it instead of silently
    # returning nothing.
    if isinstance(data, dict) and "error" in data:
        err = data["error"]
        if isinstance(err, dict):
            print(f"❌ API Error: {err.get('code')}: {err.get('info')}")
        else:
            print(f"❌ API Error: {err}")
        return [], set()

    entities = data.get("entities", {})
    output_rows = []
    value_qids_to_label = set()

    # Defined once (not per entity): extracts item-valued main snaks for one
    # property and records them for later labelling.
    def get_claim_qids(claims, prop_id):
        qids_found = []
        for claim in claims.get(prop_id, []):
            val = claim.get("mainsnak", {}).get("datavalue", {}).get("value")
            # Snaks without a datavalue, or with a non-item value, are skipped.
            if isinstance(val, dict) and "id" in val:
                qids_found.append(val["id"])
        value_qids_to_label.update(qids_found)
        return "|".join(dict.fromkeys(qids_found)) # Preserve order, remove duplicates

    for qid, ent in entities.items():
        # `or {}` also tolerates None / an empty list in place of an object.
        claims = ent.get("claims") or {}
        sitelinks = ent.get("sitelinks") or {}
        title = sitelinks.get(site_id, {}).get("title")

        output_rows.append({
            "qid": qid,
            "title": title,
            "gender_qids": get_claim_qids(claims, CONF["attrs"]["gender"]),
            "country_qids": get_claim_qids(claims, CONF["attrs"]["country"]),
            "occupation_qids": get_claim_qids(claims, CONF["attrs"]["occupation"]),
            "pob_qids": get_claim_qids(claims, "P19"), # Place of Birth
        })

    return output_rows, value_qids_to_label


def wd_get_labels(qids: list[str], lang="en") -> dict:
    """Fetches labels for a list of QIDs, 50 per API request.

    Longer inputs are split into consecutive requests of at most 50 ids each,
    rather than silently ignoring everything past the 50th QID. No pause is
    inserted between those internal requests; callers that pass at most 50
    QIDs still trigger exactly one request.

    Args:
        qids: Entity QIDs whose labels are wanted.
        lang: Language code of the label to extract.

    Returns:
        ``{qid: label}`` for every entity in the response(s); the value is
        ``None`` when the entity has no label in ``lang``. A request that fails
        (transport error or non-JSON body) is reported via ``print`` and
        contributes nothing, so a fully failed call returns ``{}``.
    """
    if not qids: return {}

    qids = list(qids)
    max_ids_per_request = 50

    labels = {}
    for start in range(0, len(qids), max_ids_per_request):
        params = {
            "action": "wbgetentities",
            "ids": "|".join(qids[start:start + max_ids_per_request]),
            "props": "labels",
            "languages": lang,
            "format": "json"
        }

        try:
            r = SESSION_WD.get(WIKIDATA_API, params=params, timeout=60)
            r.raise_for_status()
            entities = r.json().get("entities", {})
        # ValueError covers a non-JSON body on `requests` versions whose
        # JSONDecodeError is not a RequestException subclass.
        except (requests.RequestException, ValueError) as e:
            print(f"❌ API Error fetching labels: {e}")
            continue

        for qid, ent in entities.items():
            # `or {}` also tolerates None / an empty list in place of an object.
            lang_entry = (ent.get("labels") or {}).get(lang) or {}
            labels[qid] = lang_entry.get("value")

    return labels

print("✅ Wikidata API helper functions are ready.")

✅ Wikidata API helper functions are ready.


In [6]:
# Cell 5: Main Enrichment Loop

# This is the main, long-running cell of the notebook. It iterates through all 1.1 million QIDs in chunks.
# For each chunk, it checks the cache, fetches any missing data from the Wikidata API, and saves the enriched chunk to a temporary file.

# --- Configuration ---
CHUNK_SIZE = 20000  # How many profiles to process before saving a file
BATCH_SIZE = 50     # How many QIDs to send to the API at once
API_SLEEP = CONF["api_sleep"]
LANG = CONF["language"]
# Column order of every chunk file; also used so a chunk with no rows still
# gets a header line and stays readable by `pd.read_csv`.
ENRICHED_COLUMNS = ["qid", "title", "gender_qids", "country_qids", "occupation_qids", "pob_qids"]

# Cell 1 falls back to an empty DataFrame when no seed file exists; stop here
# with a clear message instead of a bare KeyError on seed_df["qid"].
if "qid" not in seed_df.columns:
    raise RuntimeError("Seed data has no 'qid' column. Please run notebook 01, then re-run Cell 1.")

# --- Resumability Logic ---
# Find the last completed chunk to avoid re-processing everything.
# Chunk numbers are zero-padded to 4 digits, so a plain sort is numeric order.
existing_chunks = sorted(TMP_ENRICHED_DIR.glob("enriched_chunk_*.csv"))
start_chunk_num = 0
if existing_chunks:
    last_file = existing_chunks[-1].name
    match = re.search(r"enriched_chunk_(\d+)\.csv$", last_file)
    if match:
        start_chunk_num = int(match.group(1))

resume_offset = start_chunk_num * CHUNK_SIZE
print(f"▶️ Resuming from row {resume_offset:,} (found {start_chunk_num} completed chunks).")

# Create an iterator for QIDs, skipping those already processed
qids_iter = itertools.islice(seed_df["qid"].astype(str), resume_offset, None)

# --- Main Loop ---
current_chunk_num = start_chunk_num
while True:
    # Get the next chunk of QIDs from our iterator
    qids_in_chunk = list(itertools.islice(qids_iter, CHUNK_SIZE))
    if not qids_in_chunk:
        print("\n🏁 All QIDs processed.")
        break

    current_chunk_num += 1
    print(f"\n--- Processing Chunk {current_chunk_num} ({len(qids_in_chunk):,} QIDs) ---")

    # Check cache to see which QIDs we already have
    cached_entities = cache_get_entity_min(qids_in_chunk)
    missing_qids = [q for q in qids_in_chunk if q not in cached_entities]
    print(f"🔍 Cache hit: {len(cached_entities):,}. Missing: {len(missing_qids):,}.")

    # Fetch missing QIDs from API in batches and save them to the cache
    if missing_qids:
        all_value_qids_in_chunk = set()
        for i in tqdm(range(0, len(missing_qids), BATCH_SIZE), desc="Fetching from Wikidata"):
            batch = missing_qids[i:i + BATCH_SIZE]
            new_rows, value_qids = wd_get_enriched_entities(batch, lang=LANG)
            if new_rows:
                cache_put_entity_min(new_rows)
                all_value_qids_in_chunk.update(value_qids)
            time.sleep(API_SLEEP)

        # After fetching data, ensure all necessary labels are also cached
        cached_labels = cache_get_labels(list(all_value_qids_in_chunk), lang=LANG)
        missing_labels = [q for q in all_value_qids_in_chunk if q not in cached_labels]
        if missing_labels:
            for i in tqdm(range(0, len(missing_labels), BATCH_SIZE), desc="Fetching labels"):
                batch = missing_labels[i:i + BATCH_SIZE]
                labels = wd_get_labels(batch, lang=LANG)
                if labels:
                    cache_put_labels(labels, lang=LANG)
                time.sleep(API_SLEEP)

    # Assemble the final enriched data for this chunk (from the cache) and save it
    final_chunk_data = cache_get_entity_min(qids_in_chunk)

    # A failed API batch leaves its QIDs out of the cache, and therefore out of
    # this chunk file. Say so instead of reporting an incomplete chunk as done.
    unresolved = len(set(qids_in_chunk)) - len(final_chunk_data)
    if unresolved > 0:
        print(f"⚠️ {unresolved:,} QIDs in this chunk could not be fetched and are absent from the output.")

    out_path = TMP_ENRICHED_DIR / f"enriched_chunk_{current_chunk_num:04d}.csv"
    # Write under a name the resume glob does not match, then rename into
    # place, so an interrupted write is never mistaken for a completed chunk.
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    pd.DataFrame(list(final_chunk_data.values()), columns=ENRICHED_COLUMNS).to_csv(tmp_path, index=False)
    tmp_path.replace(out_path)
    print(f"✅ Chunk {current_chunk_num} saved to {out_path.name}")

▶️ Resuming from row 0 (found 0 completed chunks).

--- Processing Chunk 1 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/174 [00:00<?, ?it/s]

✅ Chunk 1 saved to enriched_chunk_0001.csv

--- Processing Chunk 2 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/86 [00:00<?, ?it/s]

✅ Chunk 2 saved to enriched_chunk_0002.csv

--- Processing Chunk 3 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/65 [00:00<?, ?it/s]

✅ Chunk 3 saved to enriched_chunk_0003.csv

--- Processing Chunk 4 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/57 [00:00<?, ?it/s]

✅ Chunk 4 saved to enriched_chunk_0004.csv

--- Processing Chunk 5 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/50 [00:00<?, ?it/s]

✅ Chunk 5 saved to enriched_chunk_0005.csv

--- Processing Chunk 6 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/54 [00:00<?, ?it/s]

✅ Chunk 6 saved to enriched_chunk_0006.csv

--- Processing Chunk 7 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/48 [00:00<?, ?it/s]

✅ Chunk 7 saved to enriched_chunk_0007.csv

--- Processing Chunk 8 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/43 [00:00<?, ?it/s]

✅ Chunk 8 saved to enriched_chunk_0008.csv

--- Processing Chunk 9 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/39 [00:00<?, ?it/s]

✅ Chunk 9 saved to enriched_chunk_0009.csv

--- Processing Chunk 10 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/40 [00:00<?, ?it/s]

✅ Chunk 10 saved to enriched_chunk_0010.csv

--- Processing Chunk 11 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/39 [00:00<?, ?it/s]

✅ Chunk 11 saved to enriched_chunk_0011.csv

--- Processing Chunk 12 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/37 [00:00<?, ?it/s]

✅ Chunk 12 saved to enriched_chunk_0012.csv

--- Processing Chunk 13 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/35 [00:00<?, ?it/s]

✅ Chunk 13 saved to enriched_chunk_0013.csv

--- Processing Chunk 14 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/32 [00:00<?, ?it/s]

✅ Chunk 14 saved to enriched_chunk_0014.csv

--- Processing Chunk 15 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/35 [00:00<?, ?it/s]

✅ Chunk 15 saved to enriched_chunk_0015.csv

--- Processing Chunk 16 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/29 [00:00<?, ?it/s]

✅ Chunk 16 saved to enriched_chunk_0016.csv

--- Processing Chunk 17 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/27 [00:00<?, ?it/s]

✅ Chunk 17 saved to enriched_chunk_0017.csv

--- Processing Chunk 18 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/29 [00:00<?, ?it/s]

✅ Chunk 18 saved to enriched_chunk_0018.csv

--- Processing Chunk 19 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/27 [00:00<?, ?it/s]

✅ Chunk 19 saved to enriched_chunk_0019.csv

--- Processing Chunk 20 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/26 [00:00<?, ?it/s]

✅ Chunk 20 saved to enriched_chunk_0020.csv

--- Processing Chunk 21 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/26 [00:00<?, ?it/s]

✅ Chunk 21 saved to enriched_chunk_0021.csv

--- Processing Chunk 22 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/26 [00:00<?, ?it/s]

✅ Chunk 22 saved to enriched_chunk_0022.csv

--- Processing Chunk 23 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/25 [00:00<?, ?it/s]

✅ Chunk 23 saved to enriched_chunk_0023.csv

--- Processing Chunk 24 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/24 [00:00<?, ?it/s]

✅ Chunk 24 saved to enriched_chunk_0024.csv

--- Processing Chunk 25 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/25 [00:00<?, ?it/s]

✅ Chunk 25 saved to enriched_chunk_0025.csv

--- Processing Chunk 26 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/23 [00:00<?, ?it/s]

✅ Chunk 26 saved to enriched_chunk_0026.csv

--- Processing Chunk 27 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/25 [00:00<?, ?it/s]

✅ Chunk 27 saved to enriched_chunk_0027.csv

--- Processing Chunk 28 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/27 [00:00<?, ?it/s]

✅ Chunk 28 saved to enriched_chunk_0028.csv

--- Processing Chunk 29 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/25 [00:00<?, ?it/s]

✅ Chunk 29 saved to enriched_chunk_0029.csv

--- Processing Chunk 30 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/24 [00:00<?, ?it/s]

✅ Chunk 30 saved to enriched_chunk_0030.csv

--- Processing Chunk 31 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/24 [00:00<?, ?it/s]

✅ Chunk 31 saved to enriched_chunk_0031.csv

--- Processing Chunk 32 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/20 [00:00<?, ?it/s]

✅ Chunk 32 saved to enriched_chunk_0032.csv

--- Processing Chunk 33 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/22 [00:00<?, ?it/s]

✅ Chunk 33 saved to enriched_chunk_0033.csv

--- Processing Chunk 34 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/22 [00:00<?, ?it/s]

✅ Chunk 34 saved to enriched_chunk_0034.csv

--- Processing Chunk 35 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/19 [00:00<?, ?it/s]

✅ Chunk 35 saved to enriched_chunk_0035.csv

--- Processing Chunk 36 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/19 [00:00<?, ?it/s]

✅ Chunk 36 saved to enriched_chunk_0036.csv

--- Processing Chunk 37 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/22 [00:00<?, ?it/s]

✅ Chunk 37 saved to enriched_chunk_0037.csv

--- Processing Chunk 38 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/21 [00:00<?, ?it/s]

✅ Chunk 38 saved to enriched_chunk_0038.csv

--- Processing Chunk 39 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/20 [00:00<?, ?it/s]

✅ Chunk 39 saved to enriched_chunk_0039.csv

--- Processing Chunk 40 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/21 [00:00<?, ?it/s]

✅ Chunk 40 saved to enriched_chunk_0040.csv

--- Processing Chunk 41 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/21 [00:00<?, ?it/s]

✅ Chunk 41 saved to enriched_chunk_0041.csv

--- Processing Chunk 42 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/21 [00:00<?, ?it/s]

✅ Chunk 42 saved to enriched_chunk_0042.csv

--- Processing Chunk 43 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/18 [00:00<?, ?it/s]

✅ Chunk 43 saved to enriched_chunk_0043.csv

--- Processing Chunk 44 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/19 [00:00<?, ?it/s]

✅ Chunk 44 saved to enriched_chunk_0044.csv

--- Processing Chunk 45 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/18 [00:00<?, ?it/s]

✅ Chunk 45 saved to enriched_chunk_0045.csv

--- Processing Chunk 46 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/20 [00:00<?, ?it/s]

✅ Chunk 46 saved to enriched_chunk_0046.csv

--- Processing Chunk 47 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/19 [00:00<?, ?it/s]

✅ Chunk 47 saved to enriched_chunk_0047.csv

--- Processing Chunk 48 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/17 [00:00<?, ?it/s]

✅ Chunk 48 saved to enriched_chunk_0048.csv

--- Processing Chunk 49 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/17 [00:00<?, ?it/s]

✅ Chunk 49 saved to enriched_chunk_0049.csv

--- Processing Chunk 50 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/17 [00:00<?, ?it/s]

✅ Chunk 50 saved to enriched_chunk_0050.csv

--- Processing Chunk 51 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/15 [00:00<?, ?it/s]

✅ Chunk 51 saved to enriched_chunk_0051.csv

--- Processing Chunk 52 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/13 [00:00<?, ?it/s]

✅ Chunk 52 saved to enriched_chunk_0052.csv

--- Processing Chunk 53 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/13 [00:00<?, ?it/s]

✅ Chunk 53 saved to enriched_chunk_0053.csv

--- Processing Chunk 54 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Chunk 54 saved to enriched_chunk_0054.csv

--- Processing Chunk 55 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/12 [00:00<?, ?it/s]

✅ Chunk 55 saved to enriched_chunk_0055.csv

--- Processing Chunk 56 (20,000 QIDs) ---
🔍 Cache hit: 0. Missing: 20,000.


Fetching from Wikidata:   0%|          | 0/400 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/13 [00:00<?, ?it/s]

✅ Chunk 56 saved to enriched_chunk_0056.csv

--- Processing Chunk 57 (5,607 QIDs) ---
🔍 Cache hit: 0. Missing: 5,607.


Fetching from Wikidata:   0%|          | 0/113 [00:00<?, ?it/s]

Fetching labels:   0%|          | 0/4 [00:00<?, ?it/s]

✅ Chunk 57 saved to enriched_chunk_0057.csv

🏁 All QIDs processed.


In [8]:
# Cell 6: Normalization 

# --- 1. Helper Function to Parse QID Strings ---
def parse_qids_pipe(value: str) -> list:
    """Turn a pipe-separated QID string into a list of trimmed, non-empty tokens.

    Missing values (as judged by ``pd.isna``) and the empty string yield ``[]``.
    Any other value is converted with ``str()`` before splitting on ``|``.
    """
    if pd.isna(value):
        return []
    if value == "":
        return []

    tokens = []
    for piece in str(value).split("|"):
        trimmed = piece.strip()
        if trimmed:
            tokens.append(trimmed)
    return tokens

# --- 2. Normalization Dictionaries and Functions ---

# GENDER NORMALIZATION
# Wikidata gender QIDs recognised by the notebook, mapped to their category name.
GENDER_MAP = {
    "Q6581097": "male",
    "Q6581072": "female",
    "Q1052281": "trans woman",
    "Q2449503": "trans man",
    "Q48270": "non-binary",
    "Q1097630": "intersex",
}
def normalize_gender(qids: list) -> str:
    """Collapse a list of gender QIDs into a single category name.

    QIDs not present in ``GENDER_MAP`` are ignored. If nothing is recognised
    the result is ``"unknown"``. When several categories are present, the one
    listed earliest in the ranking below wins; a recognised category that is
    not in the ranking falls back to the alphabetically first one.
    """
    recognised = set()
    for qid in qids:
        if qid in GENDER_MAP:
            recognised.add(GENDER_MAP[qid])

    if len(recognised) == 0:
        return "unknown"

    ranking = ("trans woman", "trans man", "non-binary", "male", "female", "intersex")
    ranked_hits = [name for name in ranking if name in recognised]
    if ranked_hits:
        return ranked_hits[0]
    return min(recognised)

# COUNTRY NORMALIZATION (with Place of Birth Fallback)
# Alternative country labels mapped to the single spelling used in the output.
COUNTRY_SYNONYMS = {
    "United States of America": "United States",
    "USA": "United States",
    "United Kingdom": "United Kingdom",
    "Great Britain": "United Kingdom",
    "Russian Federation": "Russia",
    "People's Republic of China": "China",
}
def normalize_country(country_qids, pob_qids, label_cache) -> str:
    """Pick one country name for a person, falling back to place of birth.

    The citizenship QIDs are tried first, then the place-of-birth QIDs. Within
    a list, the first QID that has a non-empty label in ``label_cache`` wins,
    and that label is passed through ``COUNTRY_SYNONYMS``. Returns
    ``"unknown"`` when neither list yields a label.

    Note that the fallback returns the label of the birth *place* itself; this
    function does not resolve that place to the country it belongs to.
    """
    for candidates in (country_qids, pob_qids):
        resolved = []
        for qid in candidates:
            raw_label = label_cache.get(qid)
            if raw_label:
                resolved.append(COUNTRY_SYNONYMS.get(raw_label, raw_label))
        if resolved:
            return resolved[0]
    return "unknown"

# OCCUPATION NORMALIZATION 
# Lower-cased occupation labels mapped to the canonical label they are merged into.
OCC_SYNONYMS = {
    "footballer": "association football player",
    "soccer player": "association football player",
    "actress": "actor",
    "movie actor": "actor",
    "film actor": "actor",
    "author": "writer",
    "novelist": "writer",
    "businessman": "businessperson",
    "businesswoman": "businessperson",
    "doctor": "physician",
    "surgeon": "physician",
}
def normalize_occupation(qids: list, label_cache) -> str:
    """Return the canonical primary occupation for a list of occupation QIDs.

    Each QID is looked up in ``label_cache``; QIDs without a (non-empty) label
    are skipped. Remaining labels are lower-cased and passed through
    ``OCC_SYNONYMS``, and the first one in input order is returned. The result
    is ``"unknown"`` for an empty list or when no QID has a usable label.
    """
    if len(qids) == 0 if hasattr(qids, "__len__") else not qids:
        return "unknown"

    # Lower-case every label that exists; None/empty labels are dropped here.
    lowered = [text.lower() for text in map(label_cache.get, qids) if text]

    canonical = []
    for name in lowered:
        if name:
            canonical.append(OCC_SYNONYMS.get(name, name))

    if not canonical:
        return "unknown"
    return canonical[0]


# --- 3. Processing Loop with Stats Collection ---
print("\n--- Applying Normalization and Collecting Stats ---")

enriched_files = sorted(TMP_ENRICHED_DIR.glob("enriched_chunk_*.csv"))
if not enriched_files:
    print("⚠️ No enriched files found to normalize. Please run the previous cell first.")
else:
    # Pass 1: collect every value QID referenced by any chunk so that all
    # labels can be resolved once, up front.
    all_value_qids = set()
    for f in enriched_files:
        # keep_default_na=False keeps empty cells as "" instead of NaN.
        df = pd.read_csv(f, keep_default_na=False)
        for col in ["gender_qids", "country_qids", "occupation_qids", "pob_qids"]:
            if col in df.columns:
                for cell in df[col]:
                    all_value_qids.update(parse_qids_pipe(cell))

    print(f"Building master label cache for {len(all_value_qids):,} unique QIDs...")
    cached_labels = cache_get_labels(list(all_value_qids), lang=LANG)
    missing_labels = [q for q in all_value_qids if q not in cached_labels]
    if missing_labels:
        for i in tqdm(range(0, len(missing_labels), BATCH_SIZE), desc="Fetching final labels"):
            batch = missing_labels[i:i + BATCH_SIZE]
            labels = wd_get_labels(batch, lang=LANG)
            if labels: cache_put_labels(labels, lang=LANG)
            # Same pause between requests as the enrichment loop uses.
            time.sleep(API_SLEEP)

    LABEL_CACHE = cache_get_labels(list(all_value_qids), lang=LANG)
    print("✅ Master label cache complete.")

    gender_counts, country_counts, occupation_counts = Counter(), Counter(), Counter()

    # Pass 2: normalize each chunk, write it out, and accumulate the counts
    # used by the preview below.
    for f in tqdm(enriched_files, desc="Normalizing chunks"):
        df = pd.read_csv(f, keep_default_na=False)
        out_path = TMP_NORMALIZED_DIR / f.name.replace("enriched_", "normalized_")

        df["gender"] = df["gender_qids"].apply(parse_qids_pipe).apply(normalize_gender)
        df["country"] = df.apply(
            lambda row: normalize_country(
                parse_qids_pipe(row.get("country_qids", "")),
                parse_qids_pipe(row.get("pob_qids", "")),
                LABEL_CACHE), axis=1)
        df["occupation"] = df["occupation_qids"].apply(parse_qids_pipe).apply(
            lambda qids: normalize_occupation(qids, LABEL_CACHE))

        gender_counts.update(df["gender"])
        country_counts.update(df["country"])
        occupation_counts.update(df["occupation"])

        df[["qid", "title", "gender", "country", "occupation"]].to_csv(out_path, index=False)

    print("\n🏁 Normalization processing complete. Generating preview...")

    # --- 4. Generate and Display Preview ---
    total_rows = sum(gender_counts.values())

    def pct_of_total(count):
        """Share of `count` in all normalized rows, in percent (0.0 when there are no rows)."""
        return (count / total_rows) * 100 if total_rows else 0.0

    print("\n--- Data Quality Preview ---")

    unknown_gender_pct = pct_of_total(gender_counts['unknown'])
    unknown_country_pct = pct_of_total(country_counts['unknown'])
    unknown_occupation_pct = pct_of_total(occupation_counts['unknown'])

    print("\nPercentage of Unknown Values:")
    print(f"  - Gender:     {unknown_gender_pct:.2f}%")
    print(f"  - Country:    {unknown_country_pct:.2f}% (after fallback to place of birth)")
    print(f"  - Occupation: {unknown_occupation_pct:.2f}%")

    print("\nTop 10 Countries:")
    for rank, (country, count) in enumerate(country_counts.most_common(10), start=1):
        pct = pct_of_total(count)
        # Right-align the rank to two characters so row 10 lines up with rows 1-9.
        print(f"  {rank:>2}. {country:<20} | {count:>8,} ({pct:.2f}%)")

    print("\nTop 20 Occupations:")
    for rank, (occ, count) in enumerate(occupation_counts.most_common(20), start=1):
        pct = pct_of_total(count)
        print(f"  {rank:02}. {occ:<30} | {count:>8,} ({pct:.2f}%)")


--- Applying Normalization and Collecting Stats ---
Building master label cache for 83,585 unique QIDs...
✅ Master label cache complete.


Normalizing chunks:   0%|          | 0/57 [00:00<?, ?it/s]


🏁 Normalization processing complete. Generating preview...

--- Data Quality Preview ---

Percentage of Unknown Values:
  - Gender:     0.41%
  - Country:    13.18% (after fallback to place of birth)
  - Occupation: 4.57%

Top 10 Countries:
  1. United States        |  221,066 (19.64%)
  2. unknown              |  148,389 (13.18%)
  3. United Kingdom       |   72,514 (6.44%)
  4. Canada               |   36,260 (3.22%)
  5. India                |   32,609 (2.90%)
  6. Australia            |   29,574 (2.63%)
  7. France               |   28,271 (2.51%)
  8. Germany              |   27,240 (2.42%)
  9. Japan                |   25,506 (2.27%)
  10. Italy                |   20,771 (1.85%)

Top 20 Occupations:
  01. association football player    |  177,900 (15.80%)
  02. politician                     |  108,109 (9.60%)
  03. actor                          |   53,606 (4.76%)
  04. unknown                        |   51,492 (4.57%)
  05. writer                         |   29,906 (2.66%)
  0