In [28]:
%pip install -q wikipedia-api

In [29]:
import wikipediaapi
import os

wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI
)

In [37]:
def get_pages(page, memo, depth=1):
    pages = []
    for p in page.categorymembers.values():
        if p.title in memo:
            continue
        memo.add(page.title)
        if p.ns == wikipediaapi.Namespace.CATEGORY and depth >= 0:
            pages.extend(get_pages(p, memo, depth=depth-1))
        elif p.ns == wikipediaapi.Namespace.MAIN:
            pages.append(p)
    return pages

cat = wiki_wiki.page("Category:Algorithms")
memo = set()
pages = get_pages(cat, memo, 2)

In [38]:
pages = [p for p in pages if not p.title.startswith("List of")]
pages = [p for p in pages if not p.title.startswith("Lists of")]
pages = [p for p in pages if not p.title.startswith("Index of")]
# pages = [p for p in pages if "Medal" not in p.title]
# pages = [p for p in pages if "Lectures" not in p.title]
# pages = [p for p in pages if "Timeline" not in p.title]
len(pages), pages[1000:1000+20]

(14153,
 [Spatial ETL (id: ??, ns: 0),
  Spatial reference system (id: ??, ns: 0),
  SRTM Water Body Data (id: ??, ns: 0),
  Suitability analysis (id: ??, ns: 0),
  Suitability model (id: ??, ns: 0),
  SuperMap (id: ??, ns: 0),
  Terralink International (id: ??, ns: 0),
  Tiled web map (id: ??, ns: 0),
  Toponym resolution (id: ??, ns: 0),
  Traditional knowledge GIS (id: ??, ns: 0),
  The Truth About Crime (id: ??, ns: 0),
  United Nations Spatial Data Infrastructure (id: ??, ns: 0),
  Vector Map (id: ??, ns: 0),
  Vector overlay (id: ??, ns: 0),
  Viewshed analysis (id: ??, ns: 0),
  Visualizing Energy Resources Dynamically on the Earth (id: ??, ns: 0),
  Web Registry Service (id: ??, ns: 0),
  Wikimapia (id: ??, ns: 0),
  Intersection (Euclidean geometry) (id: ??, ns: 0),
  DE-9IM (id: ??, ns: 0)])

In [39]:
import shutil
dst_dir = "wiki_data"
shutil.rmtree(dst_dir, ignore_errors=True)
os.mkdir(dst_dir)

title2index = {}
for p in pages:
    if p.title not in title2index:
        title2index[p.title] = len(title2index)

with open(os.path.join(dst_dir, "nodes.txt"), 'w') as g:
    for i, name in title2index.items():
        g.write(f'{i}\t{name}\n')

In [44]:
from tqdm.notebook import tqdm
from time import sleep
from copy import deepcopy
import gc

def load_pages(nodes_file):
    titles = []
    with open(nodes_file) as f:
        for line in f:
            title, index = line.strip().split('\t')
            index = int(index)
            title2index[title] = index
            titles.append(line.split('\t')[0])
    pages = []
    for title in tqdm(titles, desc='load pages'):
        pages.append(wiki_wiki.page(title))
    return pages

def process_chunk(pages_list, chunk_num, dst_dir, chunk_size=5769):
    pages_to_process = pages_list[chunk_num*chunk_size:(chunk_num+1)*chunk_size]
    h = open(os.path.join(dst_dir, f"texts{chunk_num}.txt"), 'w')
    edges = set()
    for p in tqdm(pages_to_process, desc=f'process chunk {chunk_num}'):
        if p.title not in title2index:
            # print(f'WARNING: missing page {p.title}')
            continue
        for l in p.links.values():
            link_title = l.title
            if link_title not in title2index:
                continue
            try:
                edges.add((title2index[p.title], title2index[link_title]))
            except KeyError:
                pass
                # print(f'WARNING: key error {p.title}, {link_title}')
        h.write(f'= {p.title} =\n\n{p.text}\n\n')
        sleep(0.01)  # max 100 RPS
    h.close()

    with open(os.path.join(dst_dir, f"edges{chunk_num}.txt"), 'w') as f:
        for src, dst in edges:
            f.write(f'{src}\t{dst}\n')


In [45]:
title2index = {}
pages = load_pages("wiki_data/nodes.txt")

load pages:   0%|          | 0/11538 [00:00<?, ?it/s]

In [46]:
x = len(pages)

print("Divisors of", x, end=": ")
for i in range(2, x):
    if x % i == 0:
        print(i, end = " ")

Divisors of 11538: 2 3 6 9 18 641 1282 1923 3846 5769 

In [None]:
process_chunk(pages, 0, "wiki_data")
# process_chunk(pages, 1, "wiki_data")