## Extração dos dumps

In [1]:
# Standard and third-party library imports
import hashlib
import os
import requests
import sys
import shutil
import subprocess
from tqdm import tqdm  # For displaying progress bars

from typing import (
    Tuple,
    Dict,
)  # For type hinting in functions that return multiple values or dictionaries


def create_multistream_dir() -> None:
    """
    Make sure the folders used to store the multistream files exist.

    Both '../multistream/compressed' and '../multistream/decompressed' are
    resolved relative to the current working directory. Missing parent folders
    are created as well, and folders that already exist are left untouched.
    """

    required_folders = (
        "../multistream/compressed",
        "../multistream/decompressed",
    )

    for folder in required_folders:
        # exist_ok=True turns the call into a no-op when the folder is already there
        # https://docs.python.org/3/library/os.html#os.makedirs
        os.makedirs(folder, exist_ok=True)


def get_articles_details(dump_url: str) -> Dict[str, dict]:
    """
    Fetch the dump status from the provided URL and return the details of the
    'articlesmultistreamdumprecombine' job, after checking that it is completed.

    Args:
        dump_url (str): URL of the 'dumpstatus.json' document to fetch.

    Returns:
        Dict[str, dict]: Details of the articles multistream dump job.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (e.g. the wikiname/date does not exist).
        requests.Timeout: If the server does not answer within 30 seconds.
        KeyError: If the JSON document has no 'articlesmultistreamdumprecombine' job.
        Exception: If the status of the dump is not 'done' (raised by check_articles_status).
    """

    print("[INFO] Fetching dump info")

    # Without a timeout, requests waits indefinitely on an unresponsive server
    # https://requests.readthedocs.io/en/latest/user/quickstart/#timeouts
    resp = requests.get(url=dump_url, timeout=30)

    # Fail early with the HTTP status instead of a confusing JSON decoding error
    # when the URL points to a missing dump (error pages are not JSON)
    # https://requests.readthedocs.io/en/latest/api/#requests.Response.raise_for_status
    resp.raise_for_status()

    # https://requests.readthedocs.io/en/latest/user/quickstart/#json-response-content
    data = resp.json()

    # (ex) See https://dumps.wikimedia.org/ptwiki/20240720/dumpstatus.json
    articles_details = data["jobs"]["articlesmultistreamdumprecombine"]

    # Raise an exception if the dump is not completed ('done')
    check_articles_status(articles_details)

    return articles_details


def check_articles_status(articles_details: Dict[str, str]) -> None:
    """
    Raise an exception if the dump status is not 'done'.

    Args:
        articles_details (Dict[str, str]): Details of the articles multistream dump;
            only its 'status' entry is read.

    Raises:
        KeyError: If 'articles_details' has no 'status' entry.
        Exception: If the status of the dump is not 'done'.
    """

    status = articles_details["status"]

    if status != "done":
        # The message names the job that is inspected ('...dumprecombine')
        raise Exception(
            f"'Article Multistream Dump Recombine' is not 'done' ({status})"
        )


def get_file(dump_url: str) -> Tuple[Dict[str, str], str]:
    """
    Retrieve the file details of the multistream dump and its filename.

    Args:
        dump_url (str): URL to fetch dump details.

    Returns:
        Tuple[Dict[str, str], str]: A tuple containing the file details and the filename.

    Raises:
        IndexError: If the dump lists no file ending with
            '-pages-articles-multistream.xml.bz2'.
    """

    # Fetch dump status and details for 'articlesmultistreamdumprecombine' and check if the dump is completed
    articles_details = get_articles_details(dump_url)

    # The job also lists a '-pages-articles-multistream-index.txt.bz2' file,
    # so the archive has to be selected by its suffix
    # (ex) See https://dumps.wikimedia.org/ptwiki/20240720/dumpstatus.json
    suffix = "-pages-articles-multistream.xml.bz2"
    files = articles_details["files"]

    # Take the first matching entry without building an intermediate list
    key = next((name for name in files if name.endswith(suffix)), None)
    if key is None:
        # IndexError is kept (it is what indexing an empty list raised before),
        # but the message now says what is actually missing
        raise IndexError(
            f"No file ending with '{suffix}' is listed in the dump status ({dump_url})"
        )

    file = files[key]

    # Extract the filename from the URL for later use
    filename = get_filename(file)

    return file, filename


def get_filename(file: Dict[str, str]) -> str:
    """
    Return the last path component of the file's URL.

    Args:
        file (Dict[str, str]): Dictionary containing file details; only 'url' is read.

    Returns:
        str: Everything after the last '/' of the URL (the whole URL if it has no '/').
    """

    # (ex) /ptwiki/20240720/ptwiki-20240720-pages-articles-multistream.xml.bz2 ->
    #   -> ptwiki-20240720-pages-articles-multistream.xml.bz2
    _, _, basename = file["url"].rpartition("/")

    return basename


def check_sha1(path: str, filename: str) -> str:
    """
    Hash the file at 'path' with SHA1, reading it in fixed-size blocks.

    Args:
        path (str): Path to the file.
        filename (str): Name of the file (only used in the log message).

    Returns:
        str: SHA1 checksum of the file in hexadecimal format.
    """

    print(f"[INFO] Checking SHA1 Checksum of '{filename}'")

    # https://docs.python.org/3/library/hashlib.html#hashlib.sha1
    digest = hashlib.sha1()

    with open(path, "rb") as stream:
        # iter() with a sentinel keeps calling read() until it returns b"" (end of file),
        # so the file is never loaded into memory as a whole
        for block in iter(lambda: stream.read(65536), b""):
            digest.update(block)

    # https://docs.python.org/3/library/hashlib.html#hashlib.hash.hexdigest
    return digest.hexdigest()


def check_disk_space(
    file: Dict[str, str], path: str = "../multistream/compressed"
) -> None:
    """
    Check if there is enough disk space to download the file.

    The free space is measured on the filesystem that holds 'path' (the download
    folder by default) instead of on '/', which may be a different volume.

    Args:
        file (Dict[str, str]): Dictionary containing file details; its 'size' entry
            is compared against the free space reported by shutil.disk_usage (bytes).
        path (str): Folder (or file) where the download will be stored. It does not
            need to exist yet: the nearest existing parent folder is inspected.

    Raises:
        Exception: If there is not enough disk space available.
    """

    # shutil.disk_usage needs an existing path, so climb up until one is found
    # (the loop stops at the filesystem root, whose parent is itself)
    target = os.path.abspath(path)
    while not os.path.exists(target):
        parent = os.path.dirname(target)
        if parent == target:
            break
        target = parent

    # https://docs.python.org/3/library/shutil.html#shutil.disk_usage
    free = shutil.disk_usage(target).free

    # Accept the size whether it was parsed as a number or kept as a numeric string
    required = int(file["size"])

    if free < required:
        raise Exception(f"Not enough disk space ({required} / {free})")


def download_multistream(file: Dict[str, str], filename: str, path: str) -> None:
    """
    Download the multistream file and verify its integrity.

    Args:
        file (Dict[str, str]): Dictionary containing file details
            ('url', 'size' and 'sha1' are read).
        filename (str): Name of the file to be downloaded (used in log messages).
        path (str): Path to save the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the connection cannot be established within 10 seconds
            or the server stops sending data for more than 60 seconds.
        Exception: If there is not enough disk space (raised by check_disk_space) or
            if the SHA1 checksum of the downloaded file does not match the expected one.
    """

    # Ensure there's enough free disk space to download the file
    check_disk_space(file)

    # (ex) 'https://dumps.wikimedia.org' + '/ptwiki/20240720/ptwiki-20240720-pages-articles-multistream.xml.bz2'
    url = "https://dumps.wikimedia.org" + file["url"]

    # Download file from dumps.wikimedia.org
    # timeout=(connect, read): the read timeout applies to each wait for data, not to
    # the whole transfer, so a long download is fine but a stalled one is aborted
    # https://requests.readthedocs.io/en/latest/user/advanced/#timeouts
    with requests.get(url, stream=True, timeout=(10, 60)) as stream:
        # Check if the request was successful
        # https://requests.readthedocs.io/en/latest/api/#requests.Response.raise_for_status
        stream.raise_for_status()
        # A missing 'content-length' header yields 0, i.e. a progress bar without a known total
        # https://stackoverflow.com/a/44299915
        total_size = int(stream.headers.get("content-length", 0))

        # Initialize a progress bar to track the number of bytes downloaded
        # https://tqdm.github.io/docs/tqdm/
        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            desc=f"[INFO] Downloading '{filename}'",
            initial=0,
            file=sys.stdout,
        ) as pbar:
            with open(path, mode="wb") as multistream_file:
                # Stream the body in 10 KiB chunks so it is never held in memory as a whole
                for chunk in stream.iter_content(chunk_size=10 * 1024):
                    multistream_file.write(chunk)
                    pbar.update(len(chunk))  # Update progress bar

    # Verify the integrity of the downloaded file by comparing its SHA1 checksum with the expected value
    sha1_hex = check_sha1(path, filename)
    if sha1_hex != file["sha1"]:
        raise Exception(f"SHA1 Checksum did not match ({sha1_hex} != {file['sha1']})")


def get_multistream_file(wikinamedate: str) -> str:
    """
    Make sure the decompressed multistream file of the given dump is available locally.

    An already decompressed file is reused as is. Otherwise the archive is
    downloaded (or reused when a copy with a matching SHA1 checksum is already in
    the compressed folder), decompressed, and then removed from the compressed folder.

    Args:
        wikinamedate (str): Wiki name and date to identify the dump (ex: 'ptwiki/20240720').

    Returns:
        str: The filename of the decompressed multistream file.
    """

    # (ex) 'https://dumps.wikimedia.org/' + 'ptwiki/20240720' + '/dumpstatus.json'
    dump_url = "https://dumps.wikimedia.org/" + wikinamedate + "/dumpstatus.json"

    # Details of the '-pages-articles-multistream.xml.bz2' file of the dump
    file, filename = get_file(dump_url)

    # Drop the 4-character '.bz2' suffix from the archive filename
    # (ex) ptwiki-20240720-pages-articles-multistream.xml.bz2 -> ptwiki-20240720-pages-articles-multistream.xml
    xml_filename = filename[:-4]

    compressed_path = os.path.join("../multistream/compressed", filename)
    decompressed_path = os.path.join("../multistream/decompressed", xml_filename)

    # Nothing to do when the file was already downloaded and decompressed
    if os.path.isfile(decompressed_path):
        print("[INFO] Valid matching multistream found in multistream folder")
        return xml_filename

    if not os.path.isfile(compressed_path):
        # No local archive at all: download it
        download_multistream(file, filename, compressed_path)
    elif check_sha1(compressed_path, filename) == file["sha1"]:
        # The archive is already there and intact, waiting to be decompressed
        print("[INFO] Valid matching multistream found in multistream folder")
    else:
        # The archive is there but its checksum differs: discard it and download again
        print("[INFO] Invalid matching multistream found in multistream folder")
        os.remove(compressed_path)
        download_multistream(file, filename, compressed_path)

    # Decompress the .bz2 file and move the resulting .xml file to the decompressed directory
    extract_dump(filename)

    # The archive is no longer needed once it has been extracted
    os.remove(compressed_path)

    return xml_filename


def delete_corrupted_xmls() -> None:
    """
    Delete every '.xml' entry found in the compressed directory.

    Fully extracted files are moved to the decompressed directory by extract_dump,
    so an '.xml' file still sitting here is treated as corrupted or incomplete.
    """

    compressed_dir = "../multistream/compressed"

    # Select the leftovers first, then delete them
    leftovers = [
        entry for entry in os.listdir(compressed_dir) if entry.endswith(".xml")
    ]

    for entry in leftovers:
        os.remove(os.path.join("../multistream/compressed/", entry))


def extract_dump(filename: str) -> None:
    """
    Decompress an archive from the compressed directory with the 'bzip2' command
    and move the resulting file to the decompressed directory.

    Args:
        filename (str): Name of the .bz2 file to decompress.

    Raises:
        subprocess.CalledProcessError: If 'bzip2' exits with a non-zero status.
    """

    print(f"[INFO] Extracting '{filename}' (this might take several minutes)")

    archive_path = "../multistream/compressed/" + filename

    # 'bzip2 -dk <archive>': -d decompresses, -k keeps the archive; the output is
    # written next to the archive, without the '.bz2' suffix
    # https://superuser.com/a/480951
    subprocess.run(["bzip2", "-dk", archive_path], check=True)

    # Name of the extracted file: the archive name minus its last 4 characters ('.bz2')
    xml_filename = filename[:-4]
    extracted_path = os.path.join("../multistream/compressed/", xml_filename)
    final_path = os.path.join("../multistream/decompressed/", xml_filename)

    # https://docs.python.org/3/library/os.html#os.replace
    # https://stackoverflow.com/a/8858026
    os.replace(extracted_path, final_path)


def select_dump() -> Tuple[str, str]:
    """
    Ask the user which dump to use and make it available locally.

    The multistream folders are created and leftover XML files are removed before
    the user is prompted; the chosen dump is then downloaded and extracted as needed.

    Returns:
        Tuple[str, str]: A tuple containing the wiki name/date and the filename of the decompressed multistream file.
    """

    # Prepare the workspace: folders first, then drop leftovers of earlier runs
    create_multistream_dir()
    delete_corrupted_xmls()

    # Request user's input for a wikiname/date reference
    selected_dump = input("Fill with wikiname/date [ex: 'ptwiki/20240720']")
    print()

    # Verify, download and extract the multistream dump file as needed
    xml_filename = get_multistream_file(selected_dump)

    print(f"[INFO] '{selected_dump}' successfully selected")
    print()

    return selected_dump, xml_filename

In [2]:
%%time
# Entry point: initialize directories, delete corrupted files, and handle download and extraction of the selected dump
wikinamedate, filename = select_dump()

# Persist both variables with IPython's %store magic so that other Jupyter notebooks can restore them (with `%store -r`)
%store filename wikinamedate

Fill with wikiname/date [ex: 'ptwiki/20240720'] ptwiki/20240720



[INFO] Fetching dump info
[INFO] Downloading 'ptwiki-20240720-pages-articles-multistream.xml.bz2': 100%|██████████| 2.43G/2.43G [08:29<00:00, 4.77MB/s]
[INFO] Checking SHA1 Checksum of 'ptwiki-20240720-pages-articles-multistream.xml.bz2'
[INFO] Extracting 'ptwiki-20240720-pages-articles-multistream.xml.bz2' (this might take several minutes)
[INFO] 'ptwiki/20240720' successfully selected

Stored 'filename' (str)
Stored 'wikinamedate' (str)
CPU times: user 1min 23s, sys: 46.9 s, total: 2min 10s
Wall time: 23min 11s
