## ⬇ Download dumps from knowledge graphs

### ⚙ Setting the required parameters

First, we define the directories where the downloaded dumps will be stored.

In [1]:
from os.path import expanduser

# Destination roots for the downloaded dumps, one per knowledge graph; the
# leading "~" is expanded to the current user's home directory.
WIKIDATA_DIR, WIKIPEDIA_DIR = map(
    expanduser, ("~/kgdata/wikidata", "~/kgdata/wikipedia")
)

Then, we run the following cells to automatically fetch the list of available dumps from the knowledge graphs. Once they have run, we can set the `SELECTED_DUMP_DATE` variable to specify which dump we want to download.

In [2]:
from tqdm.auto import tqdm
import ipywidgets as widgets, datetime
from functools import partial
from kgdata.misc.download import WikidataDump, WikipediaDump, WGet

# Create one dump handle per knowledge graph, then fetch both listings while
# reporting into a single shared progress bar.
wikidata, wikipedia = WikidataDump(), WikipediaDump()
with tqdm(desc="fetch information") as pbar:
    for source in (wikidata, wikipedia):
        source.fetch(pbar=pbar)

fetch information: 0it [00:00, ?it/s]

In [3]:
# Candidate dump dates plus, for each date, the score breakdown that the click
# handler prints. The first date is the initial selection.
dates, explanation = wikidata.list_dates(wikipedia)
selected_date = dates[0]

# Shared widget state: the list of date buttons and the area the handler
# writes its report into.
control, output = [], widgets.Output()

# These two buttons are never displayed; they only provide the style objects
# copied onto the unselected / selected date buttons.
defaultbtn, selectedbtn = widgets.Button(), widgets.Button()
selectedbtn.style.button_color = 'lightblue'  # type: ignore

def click(btn, date):
    """Button handler: make `date` the current choice and show its scores.

    Stores `date` in the global `selected_date`, gives `btn` the highlighted
    style and every other button in `control` the default style, then rewrites
    the `output` widget with the score breakdown found in `explanation[date]`.

    Args:
        btn: the button to highlight.
        date: the dump date attached to that button; it must be a key of
            `explanation` and provide `isoformat()`.
    """
    global selected_date
    selected_date = date

    # Highlight the pressed button and reset the rest. Buttons are compared by
    # identity, so only `btn` itself keeps the highlighted style.
    btn.style = selectedbtn.style
    for other in control:
        if other is btn:
            continue
        other.style = defaultbtn.style

    with output:
        # Drop the previous report before printing the new one.
        output.clear_output()
        print("Date:", date.isoformat(), "Score:", explanation[date]['total'])
        details = explanation[date]
        # The "self" and "others" parts share the same layout, so both are
        # printed by one loop.
        for label, key in (("Self", 'self'), ("Other", 'others')):
            part = details[key]
            print(f"  + {label} Score:", part['score'])
            for name, mostsimilar in part['explanation'].items():
                print(f"    - {name}: {mostsimilar}")

# Create one button per candidate date; `partial` binds that date to the
# handler so each button reports its own date when pressed.
for date in dates:
    btn = widgets.Button(description=date.isoformat())
    btn.on_click(partial(click, date=date))
    control.append(btn)

# Show the row of buttons followed by the report area.
display(widgets.HBox(control), output)

# Simulate a press on the first button so that a report is visible right away.
click(control[0], dates[0])

HBox(children=(Button(description='2024-01-01', style=ButtonStyle()), Button(description='2023-12-18', style=B…

Output()

In [3]:
# The dump date is pinned here, so the cells below do not depend on which
# button was pressed in the widget. To follow the widget instead, assign
# `selected_date` (set by the click handler) to SELECTED_DUMP_DATE.
SELECTED_DUMP_DATE = datetime.date(year=2024, month=1, day=1)
print("Selected date:", SELECTED_DUMP_DATE.isoformat())

Selected date: 2024-01-01


### 🗄 Wikidata

List the files that we are going to download:

In [4]:
# Ask the Wikidata handle which files belong to the chosen dump date; the bare
# name on the last line makes the notebook display the result.
wd_files = wikidata.list_files(SELECTED_DUMP_DATE)
wd_files

[DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/entities/20240101/wikidata-20240101-all.json.bz2'),
 DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz'),
 DumpFile(date=datetime.date(2024, 1, 1), url='https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz')]

In [5]:
# Start with an empty Wikidata job list.
# NOTE(review): presumably kept so that Wikidata can be skipped by not running
# the cell that fills this list — confirm.
wdjobs = []

In [7]:
# Turn the listed Wikidata files into download jobs targeting WIKIDATA_DIR and
# display them; the recorded output shows each job as a (url, local path) pair.
# NOTE(review): the recorded output has no job for the all.json.bz2 file listed
# in wd_files — presumably files already on disk are skipped; confirm.
wdjobs = wikidata.create_download_jobs(wd_files, WIKIDATA_DIR)
wdjobs

[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),
 ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]

### 🗄 Wikipedia

In [10]:
# Ask the Wikipedia handle which files belong to the chosen dump date and
# display them.
# NOTE(review): the recorded output lists a 2023-06-20 file for a 2024-01-01
# selection — presumably the closest available dump is returned; confirm.
wp_files = wikipedia.list_files(SELECTED_DUMP_DATE)
wp_files

[DumpFile(date=datetime.date(2023, 6, 20), url='https://dumps.wikimedia.org/other/enterprise_html/runs/20230620/enwiki-NS0-20230620-ENTERPRISE-HTML.json.tar.gz')]

In [11]:
# Turn the listed Wikipedia files into download jobs targeting WIKIPEDIA_DIR.
wpjobs = wikipedia.create_download_jobs(wp_files, WIKIPEDIA_DIR)

### 💾 Download the data

In [9]:
# Merge the job lists of whichever knowledge graphs were prepared. Each name is
# looked up in the global namespace with an empty fallback, because a skipped
# cell leaves its list undefined. Wikidata jobs come first.
jobs = [*globals().get("wdjobs", ()), *globals().get("wpjobs", ())]
jobs

[('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-page.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-page.sql.gz')),
 ('https://dumps.wikimedia.org/wikidatawiki/20240101/wikidatawiki-20240101-redirect.sql.gz',
  PosixPath('/nas/home/binhvu/kgdata/wikidata/20240101/dumps/wikidatawiki-20240101-redirect.sql.gz'))]

In [10]:
# Hand every (url, destination) job to a single WGet session, then call
# monitor() once all of them have been submitted.
# NOTE(review): presumably download() only schedules the transfer and monitor()
# blocks while showing progress — confirm in kgdata.misc.download.WGet.
with WGet.start() as wget:
    for url, outfile in jobs:
        wget.download(url, outfile)
    wget.monitor()

Download wikidatawiki-20240101-page.sql.gz: 0.00B [00:00, ?B/s]

Download wikidatawiki-20240101-redirect.sql.gz: 0.00B [00:00, ?B/s]

Convert the Wikidata JSON dump from bz2 to zst (decompressing the bz2 file in parallel) so that it can be decompressed faster later on.

In [None]:
# Decompress the bz2 dump with lbzip2 (-d: decompress, -c: write to stdout) and
# pipe it into zstd, which recompresses at level 9 into the file given by -o.
# NOTE(review): this is a shell pipeline, not Python. Run it in a terminal from
# the directory that contains the dump, or prefix it with `!` to run it from
# the notebook.
lbzip2 -cd wikidata-20240101-all.json.bz2 | zstd -9 -o wikidata-20240101-all.json.zst