In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from wiki_dump_extractor import WikiAvroDumpExtractor, page_utils
from pathlib import Path
from tqdm.auto import tqdm
from utils.db_utils import LMDBReader, LMDBWriter
import json
from rapidfuzz import process, fuzz
from rapidfuzz.process import cdist
import itertools
import re

import numpy as np


def score_section(section, event):
    """Return the summed fuzzy-match score of an event's fields against a section.

    The section text is used as the query and the event's ``when``, ``what``,
    ``who`` and ``where`` values are the candidate strings; every candidate is
    scored with ``fuzz.partial_ratio`` and the individual scores are added up.

    Args:
        section: Object exposing the section body as ``section.text``.
        event: Mapping with the keys ``"when"``, ``"what"``, ``"who"`` and
            ``"where"``.

    Returns:
        The sum of the scores of all candidates returned by ``process.extract``.
    """
    event_fields = [event[key] for key in ("when", "what", "who", "where")]
    # limit=None asks for every candidate instead of only the top few.
    matches = process.extract(
        section.text,
        event_fields,
        scorer=fuzz.partial_ratio,
        limit=None,
    )
    # Each match is a (choice, score, index) triple; only the score matters.
    return sum(score for _choice, score, _index in matches)


def score_all_sections(section, event):
    """Score a section and all of its descendants against an event.

    Args:
        section: Root of a section tree; each node exposes ``title`` and
            ``children`` (and whatever ``score_section`` reads).
        event: Event mapping forwarded unchanged to ``score_section``.

    Returns:
        A dict mapping section title to its score. Sections are visited
        parent-first, so when two sections share a title the one visited
        later overwrites the earlier score.
    """
    scores_by_title = {}

    def visit(node):
        # Parent first, then children in order (pre-order traversal).
        scores_by_title[node.title] = score_section(node, event)
        for child in node.children:
            visit(child)

    visit(section)
    return scores_by_title


def extract_page_section_words(page_text):
    """Split a page into sections and list the distinct words of each one.

    Args:
        page_text: Raw page text, parsed with
            ``page_utils.Section.from_page_text``.

    Returns:
        A pair ``(sections_titles, sections_words)`` of parallel tuples:
        the section titles, and for each section the sorted list of its
        unique lower-cased ``\\w+`` tokens.
    """
    root_section = page_utils.Section.from_page_text(page_text)

    titled_words = []
    for title, text in root_section.all_subsections_text_dict().items():
        unique_words = set(re.findall(r"\w+", text.lower()))
        titled_words.append((title, sorted(unique_words)))

    # Transpose the list of (title, words) pairs into two parallel tuples.
    sections_titles, sections_words = zip(*titled_words)
    return sections_titles, sections_words


def extract_event_words(event):
    """Return the sorted, de-duplicated lower-case words describing an event.

    Args:
        event: Mapping with string values under the keys ``"what"``,
            ``"where"``, ``"who"`` and ``"when"``.

    Returns:
        A sorted list of the unique ``\\w+`` tokens found in those four fields.
    """
    fields = (event[key] for key in ("what", "where", "who", "when"))
    tokens = re.findall(r"\w+", " ".join(fields).lower())
    return sorted(set(tokens))


def attribute_section_to_events(events, page_text):
    """Tag each event, in place, with the page section that best matches it.

    Every distinct event word is fuzzy-matched against every distinct section
    word in a single ``cdist`` call. For one event, each of its words gets,
    per section, the best score reached by any word of that section; the
    scores of a word are then normalized across sections, and the normalized
    scores are summed per section. The section with the highest total wins
    (the first one on ties, as ``argmax`` does).

    Args:
        events: List of event mappings (see ``extract_event_words`` for the
            keys that are read). Each one receives a ``"section"`` key.
        page_text: Raw text of the page the events were extracted from.

    Returns:
        None. ``events`` is modified in place. When there are no events, or
        when no section of the page contains any word, nothing can be
        attributed and the events are left untouched.
    """
    sections_titles, sections_words = extract_page_section_words(page_text)

    # Sections without a single word can never match; drop them while keeping
    # titles and word lists aligned.
    non_empty_sections = [
        (title, words)
        for title, words in zip(sections_titles, sections_words)
        if len(words) > 0
    ]
    # Guard: with no candidate section np.vstack([]) below would raise.
    if not events or not non_empty_sections:
        return
    sections_titles = [title for title, _ in non_empty_sections]
    sections_words = [words for _, words in non_empty_sections]
    event_words = [extract_event_words(event) for event in events]

    all_section_words = sorted({w for words in sections_words for w in words})
    all_event_words = sorted({w for words in event_words for w in words})
    # Word -> row/column position in the score grid (dict lookups instead of
    # repeated np.isin calls).
    section_word_to_idx = {word: idx for idx, word in enumerate(all_section_words)}
    event_word_to_idx = {word: idx for idx, word in enumerate(all_event_words)}

    # Rows are event words, columns are section words. Scores under the
    # cutoff are reported as 0; workers=-1 lets rapidfuzz use all cores.
    big_grid = cdist(all_event_words, all_section_words, score_cutoff=75, workers=-1)

    # Every word is present in its vocabulary by construction, so the
    # lookups below cannot miss.
    section_indices = [
        [section_word_to_idx[word] for word in words] for words in sections_words
    ]
    event_indices = [
        [event_word_to_idx[word] for word in words] for words in event_words
    ]

    # One column-slice of the grid per section, computed once for all events.
    big_grid_by_section = [big_grid[:, indices] for indices in section_indices]

    for event, ev_indices in zip(events, event_indices):
        # Shape (n_sections, n_event_words): best match of each event word
        # within each section.
        best_match_by_section = np.vstack(
            [
                section_grid[ev_indices].max(axis=1)
                for section_grid in big_grid_by_section
            ]
        )
        # Normalize each word across sections; the floor of 1 avoids dividing
        # by zero for words that match nowhere.
        word_totals = np.maximum(1, best_match_by_section.sum(axis=0))
        scores = (best_match_by_section / word_totals).sum(axis=1)
        event["section"] = sections_titles[scores.argmax()]


def batch_iterator(iterable, batch_size=1000):
    """
    Lazily split an iterable into consecutive lists of bounded length.

    Args:
        iterable: Any iterable; it is consumed only as batches are requested.
        batch_size: Maximum number of items per batch.

    Returns:
        Iterator of lists; every list has ``batch_size`` items except
        possibly the last one. Nothing is yielded for an empty iterable.
    """
    source = iter(iterable)

    def take_next_batch():
        return list(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling until an empty batch comes back.
    yield from iter(take_next_batch, [])


def _parse_event_year(when):
    """Return the signed year at the start of a ``"year/..."`` date string.

    A trailing ``BC`` marker (with or without a space before it) makes the
    year negative.
    """
    year = when.split("/")[0].strip()
    if year.endswith("BC"):
        # int() ignores surrounding whitespace, so "44 BC" and "44BC" both work.
        return -int(year[:-2])
    return int(year)


def remove_date_outliers(events):
    """Drop events whose small year is far from the other events' years.

    Only years in the range [-10, 31] are ever removed, and only when they
    lie more than three standard deviations from the mean year of all events.
    NOTE(review): presumably this range targets day/month numbers mistaken
    for years -- confirm with the data.

    Args:
        events: List of mappings whose ``"when"`` value starts with a year,
            optionally suffixed by ``BC`` and followed by ``/``-separated
            parts.

    Returns:
        A new list with the outlier events removed, in the original order.
        The number of removed events is printed when it is non-zero.
    """
    if not events:
        # Avoid numpy's mean-of-empty-slice warnings.
        return []

    events_years = [_parse_event_year(event["when"]) for event in events]
    year_mean, year_std = np.mean(events_years), np.std(events_years)
    tol = 3 * year_std
    # Inclusive bounds: with a single event or identical years the standard
    # deviation is 0 and a strict comparison would discard every event.
    filtered_events = [
        event
        for (event, year) in zip(events, events_years)
        if (year < -10) or (year > 31) or (year_mean - tol <= year <= year_mean + tol)
    ]
    n_removed = len(events) - len(filtered_events)
    if n_removed > 0:
        print(f"Removed {n_removed} events")
    return filtered_events

In [7]:
# Attribute a page section to every extracted event and store the result in a
# new LMDB database, batch by batch. Pages already present in the target
# database are skipped, so the cell can be re-run to resume.
generated_data_dir = Path("generated_data")
wiki_data_dir = Path("wikipedia_data")
dump = WikiAvroDumpExtractor(
    wiki_data_dir / "wiki_dump.avro", index_dir=wiki_data_dir / "wiki_dump_index_db"
)
target = generated_data_dir / "events_extracted_by_page_gemini-2.0_processed_db"


def _iter_unprocessed_pages(events_db, target_db):
    """Yield ``(title, raw_events)`` for pages missing from ``target_db``.

    Keys coming out of the reader are normalized to ``str`` once here, so the
    rest of the loop never has to guess whether a title is text or bytes.
    """
    for key, raw_events in events_db:
        title = key.decode() if isinstance(key, bytes) else key
        if target_db.get(title.encode()) is None:
            yield title, raw_events


# NOTE(review): `True or` forces the cell to run even when the target already
# exists (resuming relies on the per-page check above) -- drop it to make the
# existence check effective.
if True or not target.exists():
    with LMDBWriter(target) as target_db:
        with LMDBReader(
            generated_data_dir / "events_extracted_by_page_gemini-2.0-flash_lmdb"
        ) as events_db:
            pending_pages = _iter_unprocessed_pages(events_db, target_db)
            for batch in tqdm(batch_iterator(pending_pages, batch_size=1000)):
                page_titles = [title for title, _ in batch]
                page_texts_by_title = {
                    page.title: page.text
                    for page in dump.get_page_batch_by_title(
                        page_titles, ignore_titles_not_found=True
                    )
                }
                results_batch = []
                for page_title, raw_events in batch:
                    try:
                        # json.loads accepts both str and bytes payloads.
                        events = json.loads(raw_events)
                    except (ValueError, TypeError):
                        # Undecodable or malformed payload: skip this page.
                        continue

                    # Titles missing from the dump were silently left out of
                    # the lookup above; skip them instead of raising KeyError.
                    page_text = page_texts_by_title.get(page_title)
                    if page_text is None:
                        continue

                    attribute_section_to_events(events, page_text)
                    results_batch.append(
                        (page_title.encode(), json.dumps(events).encode())
                    )

                target_db.write_batch(results_batch)


Exception ignored in: <generator object LMDBReader.__iter__ at 0x1043a0f40>
Traceback (most recent call last):
  File "/Users/valentin/Documents/programming/landnotes/landnotes-data/utils/db_utils.py", line 49, in __iter__
    with self.db.begin() as txn:
lmdb.Error: Attempt to operate on closed/deleted/dropped object.


0it [00:00, ?it/s]

Exception ignored in: <generator object LMDBReader.__iter__ at 0x1147fab60>
Traceback (most recent call last):
  File "/Users/valentin/Documents/programming/landnotes/landnotes-data/utils/db_utils.py", line 49, in __iter__
lmdb.Error: Attempt to operate on closed/deleted/dropped object.


ValueError: 3 pages not found in index:first ones are ['$1 Million Challenge', '1 World Trade Center (1971–2001)', '1348 Friuli earthquake']

0it [00:00, ?it/s]