In [1]:
# Standard and third-party library imports
from os.path import join
from sys import stdout
from typing import List, Optional  # For type hinting in function signatures

import pandas as pd
from tqdm import tqdm

# Names of the namespaces to be excluded (without the trailing colon)
_EXCLUDED_NAMESPACES = (
    "Wikipédia",
    "Categoria",
    "Predefinição",
    "Ficheiro",
    "Portal",
    "Módulo",
    "Tópico",
    "Ajuda",
    "MediaWiki",
    "Livro",
    "TimedText",
)

# Link prefixes derived from the names above: "<namespace>:"
EXCLUDED_PREFIXES = [f"{namespace}:" for namespace in _EXCLUDED_NAMESPACES]


def filter_redirects(
    redirect_array: Optional[List[str]],
    page_title_lower: set,
    excluded_prefixes: Optional[List[str]] = None,
) -> List[str]:
    """Keep only the references whose target matches a known page title.

    Each raw reference is reduced to its target by keeping the text before the
    first ``|`` and before the first ``#``, then trimming whitespace. A target
    is kept when it does not start with any excluded prefix and its lowercase
    form is a member of ``page_title_lower``.

    Args:
        redirect_array: Raw reference strings. ``None`` (a null cell) is
            treated as "no references".
        page_title_lower: Lowercased page titles used for the membership test.
        excluded_prefixes: Prefixes to reject; the comparison is
            case-sensitive. Defaults to the module-level ``EXCLUDED_PREFIXES``.

    Returns:
        The cleaned targets in input order, with their original casing.
        Duplicates are kept.
    """
    if redirect_array is None:
        return []
    if excluded_prefixes is None:
        excluded_prefixes = EXCLUDED_PREFIXES
    # str.startswith accepts a tuple and tests every prefix in a single call.
    prefixes = tuple(excluded_prefixes)

    result = []
    for redirect in redirect_array:
        redirect_ref = redirect.split("|")[0].split("#")[0].strip()
        if redirect_ref.startswith(prefixes):
            continue
        if redirect_ref.lower() in page_title_lower:
            result.append(redirect_ref)

    return result


def process_page(wikinamedate: str, output_dir: str = "../output/") -> None:
    """Filter one dump's raw page table and save the processed result.

    Reads ``raw.parquet`` from ``<output_dir>/<wikinamedate with "/" -> "-">``,
    keeps the rows whose "Page Namespace" equals ``"0"``, replaces every row's
    "Page References" with the output of ``filter_redirects`` (matched against
    the lowercased "Page Title" values of the kept rows), writes the frame to
    ``processed.parquet`` in the same directory, and prints a one-line summary
    of how many pages kept at least one reference.

    Args:
        wikinamedate: Dump identifier; any "/" is replaced by "-" to form the
            directory name.
        output_dir: Directory that contains the per-dump folders.
    """
    dump_name = wikinamedate.replace("/", "-")
    dump_dir = join(output_dir, dump_name)

    print(f"[INFO] Reading '{dump_name}/raw.parquet'")
    df = pd.read_parquet(join(dump_dir, "raw.parquet"))

    print("[INFO] Filtering namespace")
    df = df[df["Page Namespace"] == "0"]

    tqdm.pandas(
        desc="[INFO] Filtering redirects", unit_scale=True, unit=" pages", file=stdout
    )
    page_title_lower = set(df["Page Title"].str.lower().values)
    df["Page References"] = df["Page References"].progress_apply(
        lambda redirect_array: filter_redirects(redirect_array, page_title_lower)
    )

    df.to_parquet(join(dump_dir, "processed.parquet"))

    # Compute each statistic once; guard the ratio so an empty frame (no
    # namespace-0 rows) reports 0.0% instead of dividing by zero.
    pages_with_refs = int(df["Page References"].map(len).gt(0).sum())
    total_pages = int(df["Page Title"].count())
    share = round(100 * (pages_with_refs / total_pages), 2) if total_pages else 0.0

    print(
        f"[INFO] {pages_with_refs} out of {total_pages} ({share}%) "
        "pages have at least one Page Reference"
    )

    print()


In [2]:
%%time
# Restore `wikinamedate` from IPython's %store database; it is expected to have
# been saved there by a different Jupyter notebook.
%store -r wikinamedate

# Filter the raw page table for this dump and write processed.parquet.
process_page(wikinamedate)


[INFO] Reading 'ptwiki-20240720/raw.parquet'
[INFO] Filtering namespace
[INFO] Filtering redirects: 100%|██████████| 1.91M/1.91M [04:52<00:00, 6.53k pages/s]
[INFO] 1135416 out of 1909778 (59.45%) pages have at least one Page Reference
CPU times: user 5min 53s, sys: 8.89 s, total: 6min 2s
Wall time: 5min 46s
