# Extract events (births, deaths, openings, releases...) from infoboxes


In [1]:
%load_ext autoreload
%autoreload 2

## Gather all infoboxes with at least one date and one location


In [2]:
from wiki_dump_extractor import page_utils
from tqdm.auto import tqdm
from pathlib import Path
import json
from utils import db_utils
import zlib

wiki_data_dir = Path("wikipedia_data/")
generated_data_dir = Path("generated_data/")

date_keywords = [
    "date",
    "released",
    "published",
    "founded",
    "established",
    "_start",
    "_end",
    "year",
    "build",
    "opened",
]

place_keywords = ["place", "location", "city", "coordinates", "released"]
output_dir = Path("infoboxes/")


def process_infobox(page_title, data):
    data["page_title"] = page_title
    keys = set(data.keys())

    has_date = any([word in k for k in keys for word in date_keywords])
    has_location = any(
        [(word in k) for k in keys if ("replace" not in k) for word in place_keywords]
    )
    if has_date and has_location:
        return data


infoboxes_with_date_and_place_db = (
    generated_data_dir / "infoboxes_with_date_and_place_db"
)
parsed_infoboxes_db = generated_data_dir / "parsed_infoboxes_db"
if not infoboxes_with_date_and_place_db.exists():
    counter = 0
    with db_utils.LMDBWriter(infoboxes_with_date_and_place_db) as db:
        with db_utils.LMDBReader(parsed_infoboxes_db) as infoboxes_db:
            selected_infoboxes = []
            for page_title, zipped_infobox_data in tqdm(infoboxes_db):
                decompressed = zlib.decompress(zipped_infobox_data)
                infobox_data = json.loads(decompressed.decode())
                processed_data = process_infobox(page_title, infobox_data)
                compressed = zlib.compress(json.dumps(processed_data).encode())
                if processed_data is not None:
                    selected_infoboxes.append((page_title.encode(), compressed))
                    counter += 1
            db.write_batch(selected_infoboxes)
    print(len(selected_infoboxes))


## Make a list of wikipedia people

(pages whose infobox has a birth date)


In [3]:
people_pages_json = generated_data_dir / "people_pages_db,json"
redirects_db_path = wiki_data_dir / "wiki_dump_redirects_db"
if not people_pages_json.exists():
    counters = {
        "birth_event_possible": 0,
        "death_event_possible": 0,
        "event_page": 0,
        "building_page": 0,
    }
    people_pages = set()
    with db_utils.LMDBReader(infoboxes_with_date_and_place_db) as db:
        for key, value in tqdm(db):
            decompressed = zlib.decompress(value)
            data = json.loads(decompressed.decode())
            if "birth_date" in data and "birth_place" in data:
                counters["birth_event_possible"] += 1
                people_pages.add(key)
            if "death_date" in data and "death_place" in data:
                counters["death_event_possible"] += 1
                people_pages.add(key)
            if "date" in data and ("location" in data or "coordinates" in data):
                counters["event_page"] += 1
            if ("established_date" in data) or ("built_date" in data):
                counters["building_page"] += 1
    n_redirects = 0
    with db_utils.LMDBReader(redirects_db_path) as redirects_db:
        for key, value in tqdm(redirects_db):
            value = value.decode()
            if value in people_pages:
                n_redirects += 1
                people_pages.add(key)
    print(counters)
    print("Detected people pages:", len(people_pages))
    with people_pages_json.open("w") as f:
        json.dump(list(people_pages), f)

people_pages = set(json.loads(people_pages_json.read_text()))

In [None]:
from wiki_dump_extractor import date_utils


def extract_date_from_infobox_value(value):
    value = page_utils.remove_comments_and_citations(value)
    parsed_dates, errors = date_utils.extract_dates(value)
    dedup_dict = {(d.date.year, d.date.month, d.date.day): d for d in parsed_dates}
    dedup_dates = list(dedup_dict.values())
    if len(dedup_dates) == 0:
        return

    full_formats = [
        "WIKI_BIRTH_DATE",
        "DASH_YMD",
        "SLASH_DMY_MDY",
        "MONTH_DAY_YEAR",
    ]
    dedup_dates = [d for d in dedup_dates if d.format in full_formats]
    if len(dedup_dates) != 1:
        return
    return dedup_dates[0].date


def extract_locations_from_infobox_value(value, locations_by_page_title_db):
    value = page_utils.remove_comments_and_citations(value)
    if value.strip() == "":
        return []
    locations = []
    links = page_utils.extract_links(value)
    for link in links.values():
        if isinstance(link, str):
            maybe_location = locations_by_page_title_db.get(link.encode())
            if maybe_location is not None:
                locations.append(json.loads(maybe_location.decode()))
    if len(locations) == 0:
        values = value.split(",")
        for value in (",".join(values[:-i]) for i in range(len(values))):
            if value.strip() == "":
                continue

            maybe_location = locations_by_page_title_db.get(value.encode())
            if maybe_location is not None:
                maybe_location = json.loads(maybe_location.decode())
                locations.append(maybe_location)
                break
    return locations


def extract_people_from_infobox_value(value):
    value = page_utils.remove_comments_and_citations(value)
    if value.strip() == "":
        return []
    people = []
    links = page_utils.extract_links(value)
    for link in links.values():
        if isinstance(link, str):
            link = [link]
            for mylink in link:
                if mylink in people_pages:
                    people.append(mylink)
    return people


def find_historical_events(data, locations_by_page_title_db, page_counter):
    location = data.get("location", None)
    date = data.get("date", None)
    if (location is None) or (date is None):
        return []
    places = extract_locations_from_infobox_value(
        location,
        locations_by_page_title_db=locations_by_page_title_db,
    )
    date = extract_date_from_infobox_value(date)
    if (places == []) or (date is None):
        return []
    records = []
    prefix = data["page_title"].replace(" ", "_")
    page_counter["count"] += 1
    event_id = f"{prefix}_infobox_0"
    record = {
        "page_title": data["page_title"],
        "event_id": event_id,
        "event_type": "historical_event",
        "date": date.to_string(),
        "place": places,
        "event_category": "historical_page",
    }
    records.append(record)
    return records


def find_place_events(data, locations_by_page_title_db, page_counter):
    place_data = locations_by_page_title_db.get(data["page_title"].encode())
    if place_data is None:
        return []
    place_data = json.loads(place_data.decode())
    records = []
    ignored = [
        "image",
        "access",
        "url",
        "coordinates",
        "archivedate",
        "address",
    ]
    for key, value in data.items():
        if any([word in key.lower() for word in ignored]):
            continue
        date = extract_date_from_infobox_value(value)
        if date is not None:
            page_counter["count"] += 1
            prefix = data["page_title"].replace(" ", "_")
            event_id = f"{prefix}_infobox_{page_counter['count']:03d}"
            record = {
                "page_title": data["page_title"],
                "event_id": event_id,
                "event_type": key.replace("_", " "),
                "date": date.to_string(),
                "place": [place_data],
                "event_category": "place_page",
                "people": [],
            }
            records.append(record)

    return records


def find_date_and_place_type_events(data, locations_by_page_title_db, page_counter):
    datefields = [s.replace("_date", "") for s in data.keys() if s.endswith("_date")]
    placefields = [s.replace("_place", "") for s in data.keys() if s.endswith("_place")]
    intersection = set(datefields).intersection(set(placefields))
    records = []

    for event in intersection:
        date = extract_date_from_infobox_value(data[event + "_date"])
        if date is None:
            continue
        places = extract_locations_from_infobox_value(
            data[event + "_place"],
            locations_by_page_title_db=locations_by_page_title_db,
        )
        if len(places) == 0:
            continue
        prefix = data["page_title"].replace(" ", "_")
        page_counter["count"] += 1
        event_id = f"{prefix}_infobox_{page_counter['count']:03d}"
        record = {
            "page_title": data["page_title"],
            "event_id": event_id,
            "event_type": event.replace("_", " "),
            "date": date.to_string(),
            "place": places[
                0
            ],  # too often, the second place is the state, country, etc.
            "event_category": "date_and_place",
        }
        records.append(record)
    return records


events_extracted_from_infobox_db = (
    generated_data_dir / "events_extracted_from_infoboxes_db"
)
if not events_extracted_from_infobox_db.exists():
    counters = {
        "pages_with_infobox_events": 0,
        "n_events": 0,
    }
    with db_utils.LMDBWriter(events_extracted_from_infobox_db) as events_db:
        with db_utils.LMDBReader(
            generated_data_dir / "locations_by_page_title_db"
        ) as locations_by_page_title_db:
            with db_utils.LMDBReader(infoboxes_with_date_and_place_db) as infoboxes_db:
                batch = []
                for page_title, zipped_page_data in tqdm(infoboxes_db):
                    event_number = 0
                    data = json.loads(zlib.decompress(zipped_page_data).decode())

                    page_counter = {"count": 0}
                    records = []
                    date_place_events = find_date_and_place_type_events(
                        data, locations_by_page_title_db, page_counter
                    )
                    if date_place_events and (page_title in people_pages):
                        for record in date_place_events:
                            record["people"] = [page_title]
                    records += date_place_events

                    place_events = find_place_events(
                        data, locations_by_page_title_db, page_counter
                    )
                    records += place_events

                    historical_events = find_historical_events(
                        data,
                        locations_by_page_title_db,
                        page_counter,
                    )
                    if historical_events:
                        people_in_infobox = list(
                            set(
                                p
                                for val in data.values()
                                for p in extract_people_from_infobox_value(val)
                            )
                        )
                        for event in historical_events:
                            event["people"] = people_in_infobox
                        records += historical_events

                    if records:
                        counters["pages_with_infobox_events"] += 1
                        counters["n_events"] += len(records)
                        batch.append(
                            (page_title.encode(), json.dumps(records).encode())
                        )
                        if len(batch) > 1000:
                            events_db.write_batch(batch)
                            batch = []
                if batch:
                    events_db.write_batch(batch)
    counters

0it [00:00, ?it/s]

TypeError: Won't implicitly convert Unicode to bytes; use .encode()

In [12]:
counters


{'pages_with_infobox_events': 961707, 'n_events': 1245583}

In [13]:
import json
from db_utils import open_plyvel_db
from tqdm.auto import tqdm

counts = {"category": {}}
all_records = []
with open_plyvel_db(
    "events_extracted_by_page_gemini-2.0-flash_db", replace=False
) as llm_events_db:
    with open_plyvel_db(
        "events_extracted_from_infobox_db", replace=False
    ) as infobox_events_db:
        for key, value in tqdm(infobox_events_db.iterator()):
            counts["all_pages_with_infobox"] = (
                counts.get("all_pages_with_infobox", 0) + 1
            )
            data = json.loads(value.decode())
            if len(data) == 0:
                continue
            counts["nonempty_pages"] = counts.get("nonempty_pages", 0) + 1

            if llm_events_db.get(key) is not None:
                continue
            counts["nonempty_not_in_llm_database"] = (
                counts.get("nonempty_not_in_llm_database", 0) + 1
            )

            for record in data:
                counts["category"][record["event_category"]] = (
                    counts["category"].get(record["event_category"], 0) + 1
                )
                all_records.append(record)
counts

0it [00:00, ?it/s]

{'category': {'date_and_place': 943255,
  'place_page': 87811,
  'historical_page': 6510},
 'all_pages_with_infobox': 948630,
 'nonempty_pages': 948630,
 'nonempty_not_in_llm_database': 830659}