In [1]:
# Standard and third-party library imports
import pandas as pd
from os.path import join
from tqdm import tqdm  # For displaying progress bars
from sys import stdout

from typing import List  # For type hinting in functions that return lists

# Title prefixes of the namespaces whose links are dropped by filter_redirects.
# NOTE(review): these look like the Portuguese-language (ptwiki) namespace
# names — confirm before reusing the list for another wiki.
EXCLUDED_PREFIXES = [
    "Wikipédia:",  # Wikipedia internal pages
    "Categoria:",  # Category pages
    "Predefinição:",  # Template pages
    "Ficheiro:",  # File pages
    "Portal:",  # Portal pages
    "Módulo:",  # Module pages
    "Tópico:",  # Topic pages
    "Ajuda:",  # Help pages
    "MediaWiki:",  # MediaWiki system pages
    "Livro:",  # Book pages
    "TimedText:",  # Timed text pages
]


def filter_redirects(redirect_array: List[str], page_title_lower: set) -> List[str]:
    """
    Keep only the references that point at a known page outside the excluded namespaces.

    Each reference is reduced to its target (the text before the first "|" and,
    within that, before the first "#", with surrounding whitespace stripped).
    The target is kept when it does not start with any entry of
    EXCLUDED_PREFIXES (case-sensitive) and its lowercased form is a member of
    ``page_title_lower``.

    Args:
        redirect_array (List[str]): Raw reference strings of one page. ``None``
            (a missing cell) is treated as "no references".
        page_title_lower (set): Lowercased page titles considered valid targets.

    Returns:
        List[str]: The kept targets, in input order, with their original
        casing. Duplicates are not removed.
    """
    # A null cell cannot be iterated; it simply carries no references.
    if redirect_array is None:
        return []

    # str.startswith accepts a tuple, so one call replaces a Python-level loop
    # over the prefixes. Built per call so later edits to the list are honoured.
    excluded = tuple(EXCLUDED_PREFIXES)

    result = []
    for redirect in redirect_array:
        # Strip the display text ("|...") first, then the section anchor ("#...").
        target = redirect.partition("|")[0].partition("#")[0].strip()

        if target.startswith(excluded):
            continue

        # Membership is case-insensitive, but the original casing is returned.
        if target.lower() in page_title_lower:
            result.append(target)

    return result


def process_page(wikinamedate: str) -> None:
    """
    Filter one raw page dump and write the result next to it.

    Reads ``raw.parquet`` from ``../output/<name>/``, where ``<name>`` is
    ``wikinamedate`` with every "/" replaced by "-". Keeps only the rows whose
    "Page Namespace" equals the string "0", passes each row's
    "Page References" through filter_redirects against the lowercased titles
    of the remaining rows, and writes the frame to ``processed.parquet`` in the
    same directory. Progress and a summary line are printed to stdout.

    Args:
        wikinamedate (str): Identifier used to locate the input directory and
            name the output file (e.g. the cell output shows "ptwiki-20240720").
    """
    # Build the directory name once; it is used for the log line and both files.
    dump_name = wikinamedate.replace("/", "-")
    dump_dir = join("../output/", dump_name)

    # Read the raw Parquet file for the given wikinamedate
    print(f"[INFO] Reading '{dump_name}/raw.parquet'")
    df = pd.read_parquet(join(dump_dir, "raw.parquet"))

    # Keep only main-namespace rows.
    # NOTE(review): the comparison is against the string "0", so this assumes
    # the column is stored as text — confirm against the writer of raw.parquet.
    print("[INFO] Filtering namespace")
    df = df[df["Page Namespace"] == "0"]

    # Register progress_apply with a progress bar written to stdout
    tqdm.pandas(
        desc="[INFO] Filtering redirects", unit_scale=True, unit=" pages", file=stdout
    )

    # Lowercased titles of the surviving pages: the set of valid link targets
    page_title_lower = set(df["Page Title"].str.lower().values)

    # Replace each row's references with the filtered list
    df["Page References"] = df["Page References"].progress_apply(
        lambda redirect_array: filter_redirects(redirect_array, page_title_lower)
    )

    # Save the processed data to a new Parquet file
    df.to_parquet(join(dump_dir, "processed.parquet"))

    # Summary: count pages with at least one surviving reference. Computed once
    # and guarded so an empty frame reports 0.0% instead of dividing 0 by 0.
    pages_with_refs = df["Page References"].apply(len).gt(0).sum()
    total_pages = df["Page Title"].count()
    share = round(100 * (pages_with_refs / total_pages), 2) if total_pages else 0.0
    print(
        f"[INFO] {pages_with_refs} out of {total_pages} ({share}%) "
        "pages have at least one Page Reference"
    )

    print()


In [2]:
%%time
# Restore `wikinamedate` from IPython's %store database; another notebook is
# expected to have saved it there beforehand with `%store wikinamedate`.
%store -r wikinamedate

# Entry point: filter the raw Parquet dump identified by `wikinamedate` and
# write the processed Parquet file alongside it.
process_page(wikinamedate)


[INFO] Reading 'ptwiki-20240720/raw.parquet'
[INFO] Filtering namespace
[INFO] Filtering redirects: 100%|██████████| 1.91M/1.91M [04:52<00:00, 6.53k pages/s]
[INFO] 1135416 out of 1909778 (59.45%) pages have at least one Page Reference

CPU times: user 5min 53s, sys: 8.89 s, total: 6min 2s
Wall time: 5min 46s
