In [32]:
%pip install -q wikipedia-api

In [33]:
import wikipediaapi
import os

wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI
)

In [34]:
def get_pages(page, memo, depth=1):
    pages = []
    for p in page.categorymembers.values():
        if p.title in memo:
            continue
        memo.add(page.title)
        if p.ns == wikipediaapi.Namespace.CATEGORY and depth >= 0:
            pages.extend(get_pages(p, memo, depth=depth-1))
        elif p.ns == wikipediaapi.Namespace.MAIN:
            pages.append(p)
    return pages

categories = [ wiki_wiki.page("Category:"+category) for category in
              [
               "Cryptographic_algorithms",
               "Graph_algorithms",
               "Algorithms_on_strings"
               ]
              ]

memo = set()
pages = []
for cat in categories:
    pages.extend(get_pages(cat, memo, 2))

In [35]:
pages = [p for p in pages if not p.title.startswith("List of")]
pages = [p for p in pages if not p.title.startswith("Lists of")]
pages = [p for p in pages if not p.title.startswith("Index of")]
# pages = [p for p in pages if "Medal" not in p.title]
# pages = [p for p in pages if "Lectures" not in p.title]
# pages = [p for p in pages if "Timeline" not in p.title]
len(pages), pages[:10]

(1198,
 [Bach's algorithm (id: ??, ns: 0),
  BB84 (id: ??, ns: 0),
  Beaufort cipher (id: ??, ns: 0),
  Block cipher mode of operation (id: ??, ns: 0),
  CDMF (id: ??, ns: 0),
  Ciphertext stealing (id: ??, ns: 0),
  Common Scrambling Algorithm (id: ??, ns: 0),
  CryptGenRandom (id: ??, ns: 0),
  Crypto++ (id: ??, ns: 0),
  Cryptographic agility (id: ??, ns: 0)])

In [36]:
import shutil
dst_dir = "wiki_data"
shutil.rmtree(dst_dir, ignore_errors=True)
os.mkdir(dst_dir)

title2index = {}
for p in pages:
    if p.title not in title2index:
        title2index[p.title] = len(title2index)

with open(os.path.join(dst_dir, "nodes.txt"), 'w') as g:
    for i, name in title2index.items():
        g.write(f'{i}\t{name}\n')

In [38]:
counter = 0

In [43]:
from tqdm.notebook import tqdm
from time import sleep
from copy import deepcopy
import gc

def load_pages(nodes_file):
    titles = []
    with open(nodes_file) as f:
        for line in f:
            title, index = line.strip().split('\t')
            index = int(index)
            title2index[title] = index
            titles.append(line.split('\t')[0])
    pages = []
    for title in tqdm(titles, desc='load pages'):
        pages.append(wiki_wiki.page(title))
    return pages

def process_pages(pages_list, dst_dir):
    edges = set()

    for p in tqdm(pages_list):
        global counter 
        if p.title not in title2index:
            continue
        if p.title not in title2index:
            # print(f'WARNING: missing page {p.title}')
            continue
        for l in p.links.values():
            link_title = l.title
            if link_title not in title2index:
                continue
            try:
                edges.add((title2index[p.title], title2index[link_title]))
            except KeyError:
                pass
                # print(f'WARNING: key error {p.title}, {link_title}')
        h = open(os.path.join(dst_dir, f"texts{counter}.txt"), 'w')   
        counter += 1
        h.write(f'= {p.title} =\n\n{p.text}\n\n')
        sleep(0.01)  # max 100 RPS
        h.close()

    with open(os.path.join(dst_dir, f"edges.txt"), 'w') as f:
        for src, dst in edges:
            f.write(f'{src}\t{dst}\n')


In [44]:
title2index = {}
pages = load_pages("wiki_data/nodes.txt")

load pages:   0%|          | 0/1046 [00:00<?, ?it/s]

In [45]:
len(pages)

1046

In [46]:
process_pages(pages, "wiki_data")

  0%|          | 0/1046 [00:00<?, ?it/s]

In [48]:
! zip -r wiki_data.zip wiki_data

  adding: wiki_data/ (stored 0%)
  adding: wiki_data/texts390.txt (deflated 76%)
  adding: wiki_data/texts975.txt (deflated 64%)
  adding: wiki_data/texts60.txt (deflated 73%)
  adding: wiki_data/texts774.txt (deflated 56%)
  adding: wiki_data/texts373.txt (deflated 73%)
  adding: wiki_data/texts728.txt (deflated 52%)
  adding: wiki_data/texts852.txt (deflated 77%)
  adding: wiki_data/texts418.txt (deflated 71%)
  adding: wiki_data/texts634.txt (deflated 63%)
  adding: wiki_data/texts311.txt (deflated 82%)
  adding: wiki_data/texts1024.txt (deflated 62%)
  adding: wiki_data/texts858.txt (deflated 61%)
  adding: wiki_data/texts4.txt (deflated 71%)
  adding: wiki_data/texts86.txt (deflated 64%)
  adding: wiki_data/texts775.txt (deflated 51%)
  adding: wiki_data/texts869.txt (deflated 64%)
  adding: wiki_data/texts613.txt (deflated 68%)
  adding: wiki_data/texts849.txt (deflated 62%)
  adding: wiki_data/texts442.txt (deflated 49%)
  adding: wiki_data/texts871.txt (deflated 58%)
  adding: 