## ⬇ Download dumps from knowledge graphs

### ⚙ Setting the required parameters

First, we define the directories where the downloaded dumps will be stored.

In [1]:
from os.path import expanduser

import requests
from tqdm.auto import tqdm
import ipywidgets as widgets, datetime
from functools import partial
from kgdata.misc.download import WikidataDump, WikipediaDump, WGet
from pathlib import Path

# Root directories (under the current user's home) that hold the dumps of each knowledge graph.
WIKIDATA_DIR, WIKIPEDIA_DIR = (
    expanduser(f"~/kgdata/{kg}") for kg in ("wikidata", "wikipedia")
)

Then, we run the following cells to automatically fetch the list of available dumps from the knowledge graphs. At the end, we set the `SELECTED_DUMP_DATE` variable to specify which dump we want to download.

In [2]:
# One dump object per knowledge graph; both report to the same progress bar while fetching.
wikidata, wikipedia = WikidataDump(), WikipediaDump()
with tqdm(desc="fetch information") as progress:
    for dump in (wikidata, wikipedia):
        dump.fetch(pbar=progress)

fetch information: 0it [00:00, ?it/s]

In [3]:
# Candidate dump dates and, per date, the score breakdown that `click` prints.
# The first date is the default selection.
dates, explanation = wikidata.list_dates(wikipedia)
selected_date = dates[0]

# Output area that `click` writes into, and the list that will hold one button per date.
output = widgets.Output()
control = []

# Two buttons that are never displayed: they only provide the "unselected" and
# "selected" styles that `click` assigns to the real buttons.
defaultbtn, selectedbtn = widgets.Button(), widgets.Button()
selectedbtn.style.button_color = 'lightblue'  # type: ignore

def click(btn, date):
    """Select ``date`` as the dump date and print its score breakdown.

    Args:
        btn: the pressed button. It receives the style of ``selectedbtn``; every
            other button in ``control`` is reset to the style of ``defaultbtn``.
        date: the date bound to ``btn``; stored in the global ``selected_date``
            and used as the key into ``explanation``.
    """
    global selected_date
    selected_date = date

    # Highlight the pressed button and un-highlight all the others.
    btn.style = selectedbtn.style
    for other in (candidate for candidate in control if candidate is not btn):
        other.style = defaultbtn.style

    with output:
        # Replace whatever the previous selection printed.
        output.clear_output()
        info = explanation[date]
        print("Date:", date.isoformat(), "Score:", info['total'])
        # The 'self' and 'others' entries share the same layout: a score plus
        # a name -> most-similar mapping.
        for label, key in (("Self", 'self'), ("Other", 'others')):
            section = info[key]
            print(f"  + {label} Score:", section['score'])
            for name, mostsimilar in section['explanation'].items():
                print(f"    - {name}: {mostsimilar}")

# One button per candidate date; pressing it calls `click` with that date bound.
for date in dates:
    control.append(widgets.Button(description=date.isoformat()))
    control[-1].on_click(partial(click, date=date))

# Show the row of buttons, then the output area.
display(widgets.HBox(control))
display(output)

# Select the first date up front so the output area starts populated.
click(control[0], dates[0])

HBox(children=(Button(description='2024-01-01', style=ButtonStyle()), Button(description='2023-12-18', style=B…

Output()

In [3]:
# Dump date passed to the Wikidata and Wikipedia file listings.
# It is pinned by hand here; to follow the button selection instead, use:
# SELECTED_DUMP_DATE = selected_date
SELECTED_DUMP_DATE = datetime.date(year=2024, month=1, day=1)
print(f"Selected date: {SELECTED_DUMP_DATE.isoformat()}")

Selected date: 2024-01-01


### 🗄 Wikidata

List the files that we are going to download

In [4]:
# Dump files that `wikidata` lists for SELECTED_DUMP_DATE; the bare name on the
# last line makes the notebook display them.
wd_files = wikidata.list_files(SELECTED_DUMP_DATE)
wd_files

[DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/entities/20240101/wikidata-20240101-all.json.bz2'),
 DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz'),
 DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz')]

In [5]:
# Start with an empty Wikidata job list; the next cell replaces it with the real jobs.
wdjobs = []

In [7]:
# Turn the listed files into download jobs targeting WIKIDATA_DIR; the recorded
# output shows each job as a (url, local path) tuple.
# NOTE(review): the recorded output has 2 jobs for 3 listed files, so presumably files
# that are already downloaded are skipped. Confirm in `create_download_jobs`.
wdjobs = wikidata.create_download_jobs(wd_files, WIKIDATA_DIR)
wdjobs

[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),
 ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]

### 🗄 Wikidata XML Dumps

In [3]:
# Status manifest of the 2024-02-20 wikidatawiki dump run; its JSON has a 'jobs'
# section that the following cells read.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of as a confusing
# JSON/KeyError in a later cell.
resp = requests.get(
    'https://dumps.wikimedia.org/wikidatawiki/20240220/dumpstatus.json',
    timeout=60,
)
resp.raise_for_status()

In [4]:
# Files of the 'articlesmultistreamdump' job, keyed by file name; later cells read
# each entry's 'size' and 'url' fields.
files = resp.json()['jobs']['articlesmultistreamdump']['files']

In [5]:
# Sum of the 'size' field over every file of the job.
size = sum(meta['size'] for meta in files.values())

In [6]:
# Ratio of the summed size to a hard-coded reference value.
# NOTE(review): 153415825959 is unexplained, presumably an expected total in bytes. Confirm.
size / 153415825959

1.0024655324353635

In [7]:
from pathlib import Path

In [8]:
# Destination of the pages-articles files. It is derived from WIKIDATA_DIR so the
# notebook works for any user instead of relying on one hard-coded home directory.
pages_articles_dir = Path(WIKIDATA_DIR) / '20240220' / '000_dumps' / 'pages-articles'

# Download jobs as (url, local path) tuples, ordered from the smallest file to the largest.
# NOTE: the "Download the data" section rebuilds `jobs` from wdjobs/wpjobs; skip that
# cell and run the download loop directly if these files are the ones to fetch.
jobs = [
    ('https://dumps.wikimedia.org' + obj['url'], pages_articles_dir / key)
    for key, obj in sorted(files.items(), key=lambda x: x[1]['size'])
]

In [9]:
# Number of download jobs that were built.
len(jobs)

182

In [12]:
# Peek at the first job: the smallest file, since the jobs are sorted by ascending size.
jobs[0]

('https://dumps.wikimedia.org/wikidatawiki/20240220/wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2',
 PosixPath('/nas/home/binhvu/kgdata/wikidata/20240220/000_dumps/pages-articles/wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2'))

### 🗄 Wikipedia

In [10]:
# Dump files that `wikipedia` lists for SELECTED_DUMP_DATE.
# NOTE(review): the recorded output is dated 2023-06-20 although 2024-01-01 was selected,
# so presumably the closest available dump is returned. Confirm in `list_files`.
wp_files = wikipedia.list_files(SELECTED_DUMP_DATE)
wp_files

[DumpFile(date=datetime.date(2023, 6, 20), url='https://dumps.wikimedia.org/other/enterprise_html/runs/20230620/enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz')]

In [11]:
# Turn the listed Wikipedia files into download jobs targeting WIKIPEDIA_DIR.
wpjobs = wikipedia.create_download_jobs(wp_files, WIKIPEDIA_DIR)

### 💾 Download the data

In [9]:
# Merge the Wikidata and Wikipedia job lists into a fresh list. A list whose cell
# was never run (so the name does not exist) simply contributes nothing.
jobs = [*locals().get("wdjobs", ()), *locals().get("wpjobs", ())]
jobs

[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),
 ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]

In [10]:
# Number of files handed to a single WGet session.
b = 3
for start in range(0, len(jobs), b):
    batch = jobs[start:start + b]
    with WGet.start() as wget:
        # Queue every file of the batch first, then monitor them together.
        # NOTE(review): presumably monitor() blocks until the batch finishes. Confirm in WGet.
        for url, outfile in batch:
            wget.download(url, outfile)
        wget.monitor()

[32m2024-06-27 06:18:15.942[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m409[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2 exists but not marked as success. Redownload it[0m


download


Download wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 06:18:16.559[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m409[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index7.txt-p7552572p7838096.bz2 exists but not marked as success. Redownload it[0m


download


Download wikidatawiki-20240220-pages-articles-multistream-index7.txt-p7552572p7838096.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 06:18:17.315[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m409[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index26.txt-p87798894p88185873.bz2 exists but not marked as success. Redownload it[0m


download


Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p87798894p88185873.bz2: 0.00B [00:00, ?B…

[32m2024-06-27 06:18:18.062[0m | [31m[1mERROR   [0m | [36mkgdata.misc.download[0m:[36mget_download_progress[0m:[36m514[0m - [31m[1m[PID=34897] Error while downloading URL: https://dumps.wikimedia.org/wikidatawiki/20240220/wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2.
Reason: Can't determine the file size.
Output:
--2024-06-27 06:18:15--  https://dumps.wikimedia.org/wikidatawiki/20240220/wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 208.80.154.71, 2620:0:861:3:208:80:154:71
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|208.80.154.71|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 279553 (273K) [application/octet-stream]
Saving to: ‘/nas/home/binhvu/kgdata/wikidata/20240220/000_dumps/pages-articles/wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2’
     0K .......... .......... .......

RuntimeError: Failed to determine the download progress

Convert the bz2 dump to zst (decompressing in parallel with `lbzip2`) so that it can be decompressed faster later on:

In [None]:
# Shell command, not Python: run it in a terminal from the directory that holds the dump
# (or prefix it with `!` to run it from a notebook cell).
lbzip2 -cd wikidata-20240101-all.json.bz2 | zstd -9 -o wikidata-20240101-all.json.zst

Alternatively, you can split the big file by lines directly into zst-compressed parts:

lbzip2 -cd ../../000_dumps/wikidata-20220521-truthy.nt.bz2 | split -d -l1000000 --suffix-length 5 --filter 'zstd -q -6 -o $FILE.zst' - part-