# Wikipedia Dump Parser and Indexer

This Jupyter notebook contains Python code for parsing and indexing Wikipedia dump files. The script performs the following main tasks:

1. Counts the number of pages in the XML dump file
2. Extracts relevant information from each page
3. Extracts the wiki links (`[[...]]`) found in each page's revision text
4. Saves the extracted data into a Parquet file for efficient storage and querying

## Features:

- Efficient XML parsing using ElementTree
- Regex-based extraction of revision links
- Memory-efficient processing using chunking
- Progress bar for tracking page processing
- Output in Parquet format for optimized storage and query performance

## Dependencies:

- os
- sys
- subprocess
- tqdm
- xml.etree.ElementTree
- pandas
- pyarrow
- pyre2 (imported as `re2`; the script falls back to the standard `re` module if it is not available)

This script is designed to work with the Wikipedia dump files downloaded and extracted in the previous section. It processes the XML data and creates a structured dataset for further analysis.

In [1]:
# Standard and third-party library imports
from os import path, makedirs

from sys import stdout
from subprocess import check_output
from tqdm import tqdm  # For displaying progress bars
import xml.etree.ElementTree as ET
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from typing import List  # For type hinting in functions that return lists

# Try importing the 're2' module for better regex performance; fall back to 're' if unavailable
# https://pypi.org/project/pyre2/
try:
    import re2 as re
except ImportError:
    import re

    print("[WARN] Not using re2")


def count_pages(filename: str) -> int:
    """
    Count the number of <page> elements in the XML file.

    The count is obtained with ``grep -wc "<page>"``, i.e. it is the number of
    lines that contain ``<page>`` as a whole word.

    Args:
        filename (str): The name of the XML file to count pages in. It is joined
            onto "../multistream/decompressed/".

    Returns:
        int: The number of <page> elements in the file (0 if there are none).

    Raises:
        subprocess.CalledProcessError: If grep fails for any reason other than
            "no line matched" (for example, the file does not exist).
    """
    # Imported locally because the file only imports check_output from subprocess
    from subprocess import CalledProcessError

    print(f"[INFO] Counting how many \"<pages>\" in '{filename}'")

    # Run a shell command to count the occurrences of "<page>" in the file
    # -w: Match whole words only
    # -c: Print the number of matching lines
    # https://unix.stackexchange.com/questions/398413/counting-occurrences-of-word-in-text-file
    # https://docs.python.org/3/library/subprocess.html
    command = ["grep", "-wc", "<page>", path.join("../multistream/decompressed/", filename)]

    try:
        output = check_output(command)
    except CalledProcessError as error:
        # grep exits with status 1 when no line matched; with -c it still prints
        # the count ("0"), so that is a valid result rather than a failure.
        if error.returncode != 1:
            raise
        output = error.output

    # The output is a plain decimal number, so it does not depend on the
    # encoding of the current stdout (which may be None when redirected).
    return int(output.decode("ascii").strip() or 0)


def revision(text: str) -> List[str]:
    """
    Extract revision links from the given text.

    A revision link is the content of a ``[[...]]`` wiki link. Processing steps:

    1. While the first link in the text contains other links (for example
       ``[[File:x|[[A]] caption]]``), that whole outer link is removed.
    2. Parenthesised text ``(...)`` and templates ``{{...}}`` are removed.
    3. The content of every remaining ``[[...]]`` that is not directly preceded
       by ``>`` is returned, in order of appearance.

    Args:
        text (str): The text from which to extract revision links. Empty or
            None input yields an empty list.

    Returns:
        List[str]: A list of revision links found in the text.
    """

    if not text:
        return []

    # Find and remove nested "[[...]]" links at the front of the text
    while True:
        start = text.find("[[")
        if start == -1:
            break

        cursor = start + 2  # Search position just after the last consumed marker
        nested = False
        while True:
            next_open = text.find("[[", cursor)
            next_close = text.find("]]", cursor)

            # find() returns -1 for "not found"; that must not be compared as if
            # it were a position, otherwise a lone link is mistaken for a nested one.
            if next_open == -1 or next_close == -1 or next_open > next_close:
                break

            # An inner link opens before the current one closes: skip past the
            # inner link's closing brackets and keep looking for the outer close.
            nested = True
            cursor = next_close + 2

        # Exit once the first link is a plain (non-nested) one
        if not nested:
            break

        # Cut from the outer "[[" through its closing "]]". If the outer link is
        # never closed, drop only what has been consumed so the loop still advances.
        end = next_close + 2 if next_close != -1 else cursor
        text = text[:start] + text[end:]

    # Use regex to find all revision links in the cleaned text
    without_parentheses = re.sub(r"\(.*?\)", "", text)
    without_templates = re.sub(r"{{.*?}}", "", without_parentheses)
    return re.findall(r"(?<!>)\[\[(.*?)\]\]", without_templates)


def index_pages(filename: str, wikinamedate: str) -> None:
    """
    Process the XML file to extract page information and save it to a Parquet file.

    Args:
        filename (str): The name of the XML file to process.
        wikinamedate (str): A string used to name the output Parquet file.
    """

    total_pages = count_pages(filename)  # Count the total number of pages in the file
    # Create an iterator for parsing XML elements
    context = iter(
        ET.iterparse(path.join("../multistream/decompressed/", filename), events=("end",))
    )
    chunk_size = 100_000  # Number of rows per chunk to be processed
    rows = []  # List to hold data for current chunk
    pqwriter = None  # Initialize Parquet writer

    # Create necessary directory for storing output files.
    # https://docs.python.org/3/library/os.html#os.makedirs
    makedirs(path.join("../output/", wikinamedate.replace("/", "-")), exist_ok=True)

    with tqdm(
        total=total_pages,
        unit=" pages",
        unit_scale=True,
        desc="[INFO] Processing pages",
        initial=0,
        file=stdout,
    ) as pbar:
        title, id, namespace, redirect = [
            None
        ] * 4  # Initialize variables for storing page details

        # Process XML elements
        for event, elem in context:
            match elem.tag:
                case "{http://www.mediawiki.org/xml/export-0.11/}title":
                    title = elem.text
                case "{http://www.mediawiki.org/xml/export-0.11/}ns":
                    namespace = elem.text
                case "{http://www.mediawiki.org/xml/export-0.11/}id":
                    if id is None:
                        id = elem.text
                case "{http://www.mediawiki.org/xml/export-0.11/}redirect":
                    redirect = elem.attrib["title"]
                case "{http://www.mediawiki.org/xml/export-0.11/}text":
                    # Append page details and revision links to the rows list
                    if redirect is None:
                        rows.append([title, id, namespace, revision(elem.text), False])
                    else:
                        rows.append([title, id, namespace, [redirect], True])

                    id = None  # Reset ID for the next page
                    redirect = None  # Reset redirect for the next page

                    pbar.update()  # Update progress bar

            elem.clear()  # Clear the element to free memory

            # Write chunk to Parquet file if chunk size is reached
            if len(rows) >= chunk_size:
                df_chunk = pd.DataFrame(
                    rows,
                    columns=[
                        "Page Title",
                        "Page ID",
                        "Page Namespace",
                        "Page References",
                        "Page Redirect",
                    ],
                )
                table = pa.Table.from_pandas(df_chunk)

                # Create a Parquet writer if not already created
                if pqwriter is None:
                    pqwriter = pq.ParquetWriter(
                        path.join(
                            "../output/", wikinamedate.replace("/", "-"), "raw.parquet"
                        ),
                        table.schema,
                    )

                pqwriter.write_table(table)  # Write the chunk to the Parquet file

                rows = []  # Clear rows to free memory

    # Save remaining rows if any
    if rows:
        # Write chunk to Parquet file if chunk size is reached
        df_chunk = pd.DataFrame(
            rows,
            columns=[
                "Page Title",
                "Page ID",
                "Page Namespace",
                "Page References",
                "Page Redirect",
            ],
        )
        table = pa.Table.from_pandas(df_chunk)

        # Create a Parquet writer if not already created
        if pqwriter is None:
            pqwriter = pq.ParquetWriter(
                path.join("../output/", wikinamedate.replace("/", "-"), "raw.parquet"),
                table.schema,
            )

        pqwriter.write_table(table)  # Write the chunk to the Parquet file

    # Close the Parquet writer if it was opened
    if pqwriter:
        pqwriter.close()

    # Clean up variables to free memory
    del (
        title,
        id,
        namespace,
        event,
        elem,
        context,
        rows,
        pqwriter,
        chunk_size,
        df_chunk,
        table,
    )

    print()


In [2]:
%%time
# Restore the variables stored from a different Jupyter notebook
# (%store -r reloads values previously saved with the %store magic)
%store -r filename wikinamedate

# Entry point: process the XML file to extract page information and save it to a Parquet file.
# The %%time cell magic above reports the CPU and wall time of this call.
index_pages(filename, wikinamedate)


[INFO] Counting how many "<pages>" in 'ptwiki-20240720-pages-articles-multistream.xml'
[INFO] Processing pages: 100%|██████████| 2.63M/2.63M [19:15<00:00, 2.28k pages/s]

CPU times: user 19min, sys: 23.1 s, total: 19min 23s
Wall time: 19min 41s
