## ⬇ Download dumps from knowledge graphs

### ⚙ Setting the required parameters

First, we define the directories where the dumps will be stored.

In [1]:
from os.path import expanduser

import requests
from tqdm.auto import tqdm
import ipywidgets as widgets, datetime
from functools import partial
from kgdata.misc.download import WikidataDump, WikipediaDump, WGet
from pathlib import Path

# Local folders that receive the downloaded dumps, one per knowledge graph
# ("~" is expanded to the current user's home directory).
WIKIDATA_DIR, WIKIPEDIA_DIR = map(
    expanduser, ("~/kgdata/wikidata", "~/kgdata/wikipedia")
)

Then, we run the following cells to automatically fetch available dumps from the knowledge graphs. At the end, it allows us to set the `SELECTED_DUMP_DATE` variable to specify which dump we want to download.

In [None]:
# One dump index per knowledge graph; both report their progress on the
# same progress bar while fetching.
wikidata, wikipedia = WikidataDump(), WikipediaDump()
with tqdm(desc="fetch information") as pbar:
    for kg_dump in (wikidata, wikipedia):
        kg_dump.fetch(pbar=pbar)

In [3]:
# Candidate dump dates plus, for each date, the score breakdown that `click`
# prints (keys used below: 'total', 'self', 'others').
# NOTE(review): presumably `dates` is ordered best-first, which is why the first
# entry is the default selection — confirm against `list_dates`.
dates, explanation = wikidata.list_dates(wikipedia)
selected_date = dates[0]

# `control` collects one button per date; `output` is the area `click` prints into.
control = []
output = widgets.Output()

# Two never-displayed buttons that only serve as style templates: `click` copies
# their `.style` onto the real buttons to mark them as (un)selected.
defaultbtn = widgets.Button()
selectedbtn = widgets.Button()
selectedbtn.style.button_color = 'lightblue'  # type: ignore

def click(btn, date):
    """Select `date` and show how it was scored.

    Stores `date` in the global `selected_date`, highlights `btn` while
    resetting every other button in `control`, and replaces the content of
    `output` with the score breakdown found in `explanation[date]`.

    Args:
        btn: the button that was clicked.
        date: the dump date attached to that button.
    """
    global selected_date
    selected_date = date

    # highlight the clicked button, then reset all the others
    btn.style = selectedbtn.style
    for other in (b for b in control if b is not btn):
        other.style = defaultbtn.style

    with output:
        output.clear_output()
        details = explanation[date]
        print("Date:", date.isoformat(), "Score:", details['total'])
        # same layout for both parts; note the label is "Other" but the key is 'others'
        for label, part in (("Self", 'self'), ("Other", 'others')):
            print(f"  + {label} Score:", details[part]['score'])
            for name, mostsimilar in details[part]['explanation'].items():
                print(f"    - {name}: {mostsimilar}")

# One button per candidate date; clicking it calls `click(button, date=<that date>)`.
for dump_date in dates:
    date_btn = widgets.Button(description=dump_date.isoformat())
    date_btn.on_click(partial(click, date=dump_date))
    control.append(date_btn)

# Show the row of buttons followed by the explanation area.
display(widgets.HBox(control), output)

# Start with the first date selected so the explanation is never empty.
click(control[0], dates[0])

HBox(children=(Button(description='2024-01-01', style=ButtonStyle()), Button(description='2023-12-18', style=B…

Output()

In [3]:
# Choose the dump to download: either take the date picked with the buttons
# above (uncomment the next line) or pin it manually as done here.
# SELECTED_DUMP_DATE = selected_date
SELECTED_DUMP_DATE = datetime.date(year=2024, month=1, day=1)
print(f"Selected date: {SELECTED_DUMP_DATE.isoformat()}")

Selected date: 2024-01-01


### 🗄 Wikidata

List the files that we are going to download

In [4]:
# Ask the Wikidata dump index for the files of the selected date; the result
# is displayed so the URLs can be reviewed before downloading.
wd_files = wikidata.list_files(SELECTED_DUMP_DATE)
wd_files

[DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/entities/20240101/wikidata-20240101-all.json.bz2'),
 DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz'),
 DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz')]

In [5]:
# NOTE(review): when the cells run top to bottom this empty list is rebound by
# the next cell right away; presumably it is kept as a way to skip the Wikidata
# downloads by running this cell instead of the next one — confirm or delete.
wdjobs = []

In [7]:
# Turn the listed files into download jobs under WIKIDATA_DIR. The recorded
# output shows each job as a (url, destination path) pair.
# NOTE(review): the recorded output has two jobs for the three listed files;
# presumably files that are already downloaded are left out — confirm.
wdjobs = wikidata.create_download_jobs(wd_files, WIKIDATA_DIR)
wdjobs

[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),
 ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]

### 🗄 Wikidata XML Dumps

In [2]:
# Status file of the 2024-02-20 Wikidata XML dump; it lists every dump job and its files.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here, with the HTTP error, rather than later in resp.json().
resp = requests.get(
    "https://dumps.wikimedia.org/wikidatawiki/20240220/dumpstatus.json",
    timeout=60,
)
resp.raise_for_status()

In [3]:
# Keep only the files of the multistream "pages-articles" job; the result maps
# each file name to its metadata (the cells below read "size" and "url").
dump_status = resp.json()
files = dump_status["jobs"]["articlesmultistreamdump"]["files"]

In [4]:
# Sum of the "size" field over all files of the job.
size = sum(file_info["size"] for file_info in files.values())

In [5]:
# NOTE(review): 153415825959 is an unexplained constant; presumably the expected
# total size in bytes that the summed size is being checked against — confirm
# and document where it comes from.
reference_size = 153415825959
size / reference_size

1.0024655324353635

In [6]:
from pathlib import Path

In [7]:
# Destination folder for the XML dump files. It is derived from WIKIDATA_DIR
# instead of a user-specific absolute path so the notebook works for any user;
# on the original machine it resolves to the same location as before.
xml_dump_dir = Path(WIKIDATA_DIR) / "20240220" / "000_dumps" / "pages-articles"

# One (url, destination path) job per file, smallest files first.
jobs = [
    ("https://dumps.wikimedia.org" + obj["url"], xml_dump_dir / key)
    for key, obj in sorted(files.items(), key=lambda x: x[1]["size"])
]

In [8]:
# Number of XML dump files queued for download.
len(jobs)

182

### 🗄 Wikipedia

In [10]:
# Ask the Wikipedia dump index for the files matching the selected date.
# NOTE(review): the recorded output lists a 2023-06-20 file for a 2024-01-01
# selection; presumably the closest available Wikipedia dump is returned — confirm.
wp_files = wikipedia.list_files(SELECTED_DUMP_DATE)
wp_files

[DumpFile(date=datetime.date(2023, 6, 20), url='https://dumps.wikimedia.org/other/enterprise_html/runs/20230620/enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz')]

In [11]:
# Turn the listed Wikipedia files into download jobs under WIKIPEDIA_DIR.
wpjobs = wikipedia.create_download_jobs(wp_files, WIKIPEDIA_DIR)

### 💾 Download the data

In [9]:
# Gather the jobs of whichever sections above were run; a section that was
# skipped simply has no job list defined and is ignored.
# NOTE(review): this rebinds `jobs`, so the list built in the "Wikidata XML Dumps"
# section is discarded when the cells run top to bottom; give that list its own
# name and add it here if it should be downloaded as well.
jobs = []
for section_jobs_name in ("wdjobs", "wpjobs"):
    if section_jobs_name in locals():
        jobs.extend(locals()[section_jobs_name])
jobs

[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),
 ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]

In [9]:
# Process the jobs in small batches, each batch in its own WGet session:
# every job of the batch is handed to `download`, then `monitor` is called
# before the session is closed and the next batch starts.
batch_size = 3
for start in range(0, len(jobs), batch_size):
    batch = jobs[start : start + batch_size]
    with WGet.start() as wget:
        for url, outfile in batch:
            wget.download(url, outfile)
        wget.monitor()

[32m2024-06-27 06:59:16.974[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index6.txt-p5969005p6052571.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 06:59:17.607[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index7.txt-p7552572p7838096.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index7.txt-p7552572p7838096.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 06:59:18.343[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index26.txt-p87798894p88185873.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p87798894p88185873.bz2: 0.00B [00:00, ?B…

[32m2024-06-27 06:59:19.125[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index8.txt-p10838097p11495800.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index8.txt-p10838097p11495800.bz2: 0.00B [00:00, ?B/…

[32m2024-06-27 06:59:19.984[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index1.txt-p1p441397.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index1.txt-p1p441397.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index15.txt-p26524141p27066455.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index22.txt-p55190486p55730032.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index21.txt-p50027041p50690485.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index17.txt-p31078206p31710876.bz2: 0.00B [00:00, ?B…

[32m2024-06-27 06:59:24.266[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index2.txt-p441398p1114931.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index2.txt-p441398p1114931.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p118185874p118772840.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index14.txt-p24336127p25024140.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index10.txt-p15498368p16236979.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index12.txt-p19170285p19883157.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index20.txt-p44722796p45527040.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index8.txt-p9338097p10838096.bz2: 0.00B [00:00, ?B/s…

Download wikidatawiki-20240220-pages-articles-multistream-index24.txt-p64692699p65585258.bz2: 0.00B [00:00, ?B…

[32m2024-06-27 06:59:32.340[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index3.txt-p1114932p2049641.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index3.txt-p1114932p2049641.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index9.txt-p12995801p13998367.bz2: 0.00B [00:00, ?B/…

Download wikidatawiki-20240220-pages-articles-multistream-index16.txt-p28566456p29578205.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index19.txt-p39053268p40222795.bz2: 0.00B [00:00, ?B…

[32m2024-06-27 06:59:36.807[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream-index4.txt-p2049642p3098446.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream-index4.txt-p2049642p3098446.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index25.txt-p74585259p75798893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index18.txt-p34710877p36053267.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index10.txt-p13998368p15498367.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index20.txt-p40222796p41722795.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index25.txt-p68585259p70085258.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p77298894p78798893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index19.txt-p37553268p39053267.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index25.txt-p70085259p71585258.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index21.txt-p48527041p50027040.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index23.txt-p58730033p60192698.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index25.txt-p71585259p73085258.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p80298894p81798893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index21.txt-p45527041p47027040.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index19.txt-p36053268p37553267.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p75798894p77298893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p83298894p84798893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index15.txt-p25024141p26524140.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index20.txt-p41722796p43222795.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p89685874p91185873.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p84798894p86298893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p81798894p83298893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index23.txt-p57230033p58730032.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index22.txt-p50690486p52190485.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index21.txt-p47027041p48527040.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index16.txt-p27066456p28566455.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index25.txt-p73085259p74585258.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index8.txt-p7838097p9338096.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p86298894p87798893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p88185874p89685873.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index24.txt-p60192699p61692698.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p91185874p92685873.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index11.txt-p16236980p17670284.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index26.txt-p78798894p80298893.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index17.txt-p29578206p31078205.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p113685874p115185873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index24.txt-p61692699p63192698.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index5.txt-p3098447p4469004.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index24.txt-p63192699p64692698.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index18.txt-p33210877p34710876.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index25.txt-p67085259p68585258.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index20.txt-p43222796p44722795.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index7.txt-p6052572p7552571.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p115185874p116685873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p112185874p113685873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index22.txt-p52190486p53690485.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p92685874p94185873.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p97185874p98685873.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index23.txt-p55730033p57230032.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index18.txt-p31710877p33210876.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index25.txt-p65585259p67085258.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index9.txt-p11495801p12995800.bz2: 0.00B [00:00, ?B/…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p94185874p95685873.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index14.txt-p22836127p24336126.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p95685874p97185873.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index6.txt-p4469005p5969004.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream-index13.txt-p19883158p21383157.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index12.txt-p17670285p19170284.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p110685874p112185873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p107685874p109185873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p106185874p107685873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p116685874p118185873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p100185874p101685873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p103185874p104685873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p104685874p106185873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p98685874p100185873.bz2: 0.00B [00:00, ?…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p101685874p103185873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index13.txt-p21383158p22836126.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream-index27.txt-p109185874p110685873.bz2: 0.00B [00:00, …

Download wikidatawiki-20240220-pages-articles-multistream-index22.txt-p53690486p55190485.bz2: 0.00B [00:00, ?B…

Download wikidatawiki-20240220-pages-articles-multistream8.xml-p10838097p11495800.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream6.xml-p5969005p6052571.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream7.xml-p7552572p7838096.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream8.xml-p9338097p10838096.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p118185874p118772840.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream14.xml-p24336127p25024140.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream24.xml-p64692699p65585258.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream10.xml-p15498368p16236979.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p87798894p88185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream9.xml-p12995801p13998367.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream12.xml-p19170285p19883157.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream15.xml-p26524141p27066455.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream17.xml-p31078206p31710876.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream21.xml-p50027041p50690485.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream10.xml-p13998368p15498367.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream16.xml-p27066456p28566455.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream22.xml-p55190486p55730032.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream13.xml-p21383158p22836126.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p110685874p112185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream18.xml-p33210877p34710876.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream13.xml-p19883158p21383157.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream8.xml-p7838097p9338096.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p103185874p104685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p112185874p113685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p116685874p118185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p113685874p115185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p115185874p116685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p101685874p103185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream25.xml-p74585259p75798893.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream12.xml-p17670285p19170284.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p109185874p110685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream9.xml-p11495801p12995800.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p100185874p101685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream14.xml-p22836127p24336126.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p107685874p109185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream24.xml-p61692699p63192698.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p98685874p100185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p106185874p107685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream24.xml-p60192699p61692698.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p97185874p98685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p104685874p106185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p94185874p95685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream11.xml-p16236980p17670284.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p95685874p97185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream25.xml-p65585259p67085258.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream7.xml-p6052572p7552571.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream15.xml-p25024141p26524140.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream20.xml-p44722796p45527040.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p92685874p94185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream16.xml-p28566456p29578205.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream25.xml-p68585259p70085258.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream24.xml-p63192699p64692698.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream23.xml-p58730033p60192698.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream6.xml-p4469005p5969004.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream25.xml-p73085259p74585258.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 08:13:29.770[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream4.xml-p2049642p3098446.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream4.xml-p2049642p3098446.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream25.xml-p67085259p68585258.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream5.xml-p3098447p4469004.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p81798894p83298893.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p84798894p86298893.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream25.xml-p70085259p71585258.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p80298894p81798893.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 08:34:15.934[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream3.xml-p1114932p2049641.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream3.xml-p1114932p2049641.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p78798894p80298893.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p83298894p84798893.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p75798894p77298893.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 08:42:31.903[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream1.xml-p1p441397.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream1.xml-p1p441397.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream25.xml-p71585259p73085258.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p77298894p78798893.bz2: 0.00B [00:00, ?B/s]

[32m2024-06-27 08:50:31.629[0m | [1mINFO    [0m | [36mkgdata.misc.download[0m:[36mdownload[0m:[36m498[0m - [1mFile wikidatawiki-20240220-pages-articles-multistream2.xml-p441398p1114931.bz2 exists but not marked as success. Redownload it[0m


Download wikidatawiki-20240220-pages-articles-multistream2.xml-p441398p1114931.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream18.xml-p31710877p33210876.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream17.xml-p29578206p31078205.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream23.xml-p55730033p57230032.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream23.xml-p57230033p58730032.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream26.xml-p86298894p87798893.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p89685874p91185873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream21.xml-p45527041p47027040.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream22.xml-p50690486p52190485.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p88185874p89685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream22.xml-p53690486p55190485.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream27.xml-p91185874p92685873.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream22.xml-p52190486p53690485.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream20.xml-p43222796p44722795.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream19.xml-p39053268p40222795.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream21.xml-p48527041p50027040.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream21.xml-p47027041p48527040.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream20.xml-p41722796p43222795.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream20.xml-p40222796p41722795.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream19.xml-p36053268p37553267.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream18.xml-p34710877p36053267.bz2: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240220-pages-articles-multistream19.xml-p37553268p39053267.bz2: 0.00B [00:00, ?B/s]

Optionally, convert the bz2 dump to zst (decompressing in parallel with `lbzip2`) so that later decompression is faster:

In [None]:
lbzip2 -cd wikidata-20240101-all.json.bz2 | zstd -9 -o wikidata-20240101-all.json.zst

Alternatively, split the big file by lines directly, compressing each part with zstd:

lbzip2 -cd ../../000_dumps/wikidata-20220521-truthy.nt.bz2 | split -d -l1000000 --suffix-length 5 --filter 'zstd -q -6 -o $FILE.zst' - part-