In [None]:
import time
import requests
from pathlib import Path
from typing import List, Tuple
import pycountry

# Per-language MediaWiki API endpoint; {lang} is filled in by query_api.
API_URL_TMPL = "https://{lang}.wikipedia.org/w/api.php"
SLEEP        = 0.05   # small delay between requests
# All output is written beneath a folder on the current user's Desktop.
BASE_DIR     = Path.home() / "Desktop"
OUTPUT_ROOT  = BASE_DIR / "History_of_ideologies"
# Deepest category level that scrape_category processes (the root is depth 1).
MAX_DEPTH    = 4

# (language code, title) pairs already handled by save_entry during this run.
visited = set()

def sanitize(name: str) -> str:
    """Return *name* with unsafe characters replaced by underscores.

    A character is kept when it is alphanumeric (per ``str.isalnum``) or is
    a space, underscore, hyphen, or parenthesis; anything else becomes ``_``.
    """
    keep = " _-()"
    cleaned = []
    for ch in name:
        if ch.isalnum() or ch in keep:
            cleaned.append(ch)
        else:
            cleaned.append("_")
    return "".join(cleaned)

def get_english_name(lang_code: str) -> str:
    """Return the English name for a language code, or the code itself.

    The code is looked up in ``pycountry.languages`` first as a two-letter
    (``alpha_2``) and then as a three-letter (``alpha_3``) code.

    Args:
        lang_code: Language code to resolve (e.g. ``"en"``).

    Returns:
        The language's ``name`` attribute when the lookup yields one;
        otherwise ``lang_code`` unchanged.
    """
    try:
        lang = (
            pycountry.languages.get(alpha_2=lang_code)
            or pycountry.languages.get(alpha_3=lang_code)
        )
    except Exception:
        # Best-effort lookup: any lookup failure falls back to the raw code.
        # Catching Exception (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate so the run can be stopped.
        return lang_code
    # A missing record, a record without ``name``, or an empty name all fall
    # back to the code so callers always get a non-empty label.
    return getattr(lang, "name", None) or lang_code

def query_api(lang: str, params: dict, retries: int = 3) -> dict:
    """Send a GET request to a language edition's API and return parsed JSON.

    Args:
        lang: Language code substituted into ``API_URL_TMPL``.
        params: Query-string parameters for the request.
        retries: Total number of attempts. Values below 1 are treated as 1 so
            the function never falls through and returns ``None``.

    Returns:
        The decoded JSON body of the first successful response.

    Raises:
        requests.RequestException: If the final attempt fails at the HTTP
            or network level (including a non-2xx status).
        ValueError: If the final attempt returns a body that is not valid
            JSON.
    """
    url = API_URL_TMPL.format(lang=lang)
    headers = {
        "User-Agent": "WikipediaScraper/1.0 (njian29@emory.edu)"
    }
    attempts = max(1, retries)
    for attempt in range(attempts):
        try:
            r = requests.get(url, params=params, headers=headers, timeout=30)
            r.raise_for_status()
            return r.json()
        except (requests.RequestException, ValueError):
            # Only transport/HTTP/decoding failures are worth retrying;
            # anything else (e.g. a programming error) propagates at once.
            if attempt == attempts - 1:
                raise
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

def get_category_members(lang: str, category: str, cmtype: str="page") -> List[str]:
    """List the members of a category, following API continuation.

    Args:
        lang: Language code of the wiki to query.
        category: Category name WITHOUT a namespace prefix; ``Category:`` is
            prepended here.
        cmtype: ``"page"`` for articles or ``"subcat"`` for sub-categories.

    Returns:
        Member titles. For ``cmtype="subcat"`` the namespace prefix is
        removed, so each result can be passed straight back in as
        ``category``.
    """
    members: List[str] = []
    cont = None
    while True:
        params = {
            "action": "query", "format": "json", "list": "categorymembers",
            "cmtitle": f"Category:{category}", "cmtype": cmtype,
            "cmlimit": "500", "cmnamespace": "14" if cmtype == "subcat" else "0"
        }
        if cont:
            params["cmcontinue"] = cont
        data = query_api(lang, params)
        items = data["query"]["categorymembers"]
        if cmtype == "subcat":
            # Sub-category titles carry the wiki's own namespace prefix
            # (e.g. "Категория:" rather than "Category:" on non-English
            # wikis), so strip everything up to the first colon instead of
            # only the literal English prefix.
            members.extend(it["title"].split(":", 1)[-1] for it in items)
        else:
            members.extend(it["title"] for it in items)
        cont = data.get("continue", {}).get("cmcontinue")
        if not cont:
            break
    return members

def get_category_langlinks(lang: str, category: str) -> List[Tuple[str, str]]:
    """Return the interlanguage links of a category page.

    Args:
        lang: Language code of the wiki hosting the category.
        category: Category name without the ``Category:`` prefix.

    Returns:
        ``(language code, category name)`` pairs. The namespace prefix of
        each linked title is removed so the name has the same prefix-free
        form as ``category`` and can be fed to functions that prepend
        ``Category:`` themselves.
    """
    links = get_langlinks(lang, f"Category:{category}")
    # Linked titles arrive with the target wiki's localized namespace prefix
    # (e.g. "Категория:История идеологий"); drop everything up to the first
    # colon. NOTE(review): this assumes every link targets a category page --
    # a link to a prefix-less title that contains a colon would be truncated.
    return [(code, title.split(":", 1)[-1]) for code, title in links]

def get_langlinks(lang: str, title: str) -> List[Tuple[str, str]]:
    """Return ``(language code, title)`` pairs for a page's language links.

    Only the first page object in the API response is inspected; a page
    without a ``langlinks`` entry yields an empty list.
    """
    response = query_api(
        lang,
        {
            "action": "query",
            "format": "json",
            "prop": "langlinks",
            "titles": title,
            "lllimit": "max",
        },
    )
    pages = response["query"]["pages"]
    first_page = next(iter(pages.values()))
    pairs = []
    for link in first_page.get("langlinks", []):
        pairs.append((link["lang"], link["*"]))
    return pairs

def fetch_extract(lang: str, title: str) -> str:
    """Return the plain-text extract of a page, or ``""`` if there is none.

    Only the first page object in the API response is inspected.
    """
    response = query_api(
        lang,
        {
            "action": "query",
            "format": "json",
            "prop": "extracts",
            "explaintext": True,
            "titles": title,
        },
    )
    pages = response["query"]["pages"]
    first_page = next(iter(pages.values()))
    if "extract" in first_page:
        return first_page["extract"]
    return ""

def save_entry(title: str, lang: str, out_dir: Path):
    """Save an article and its other-language versions as text files.

    A directory named after the sanitized ``title`` is created under
    ``out_dir``. For the article itself and each of its language links, the
    extract is fetched and written to ``<language name>.txt`` in that
    directory. Variants already recorded in ``visited`` and variants with a
    blank extract are skipped.
    """
    langlinks = get_langlinks(lang, title)
    target_dir = out_dir / sanitize(title)
    target_dir.mkdir(parents=True, exist_ok=True)

    for code, local_title in [(lang, title), *langlinks]:
        if (code, local_title) in visited:
            continue
        # Mark before fetching so a blank extract is not requested again.
        visited.add((code, local_title))
        body = fetch_extract(code, local_title)
        if body.strip():
            out_file = target_dir / (sanitize(get_english_name(code)) + ".txt")
            out_file.write_text(body, encoding="utf-8")
            print(f"✔ Saved: {local_title} [{code}]")

def scrape_category(lang: str, category: str, out_dir: Path, depth: int=1):
    """Save every page of a category, then recurse into its sub-categories.

    Pages are saved into ``out_dir`` with a pause of ``SLEEP`` seconds after
    each one. Sub-categories are processed only while ``depth`` is below
    ``MAX_DEPTH``; each gets its own sub-directory named after the sanitized
    sub-category.
    """
    indent = "  " * (depth - 1)
    print(f"{indent}→ depth={depth}, scraping Category:{category} [{lang}]")

    pages = get_category_members(lang, category, cmtype="page")
    for page_title in pages:
        save_entry(page_title, lang, out_dir)
        time.sleep(SLEEP)

    # Stop descending once the depth limit has been reached.
    if depth >= MAX_DEPTH:
        return
    for child in get_category_members(lang, category, cmtype="subcat"):
        scrape_category(lang, child, out_dir / sanitize(child), depth + 1)

def main():
    """Scrape the root category on the English wiki and on every wiki it links to.

    Output for each language goes to ``OUTPUT_ROOT/<lang>/<sanitized title>``.
    """
    OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

    root_category = "History_of_ideologies"
    # English root first, followed by its counterparts on other wikis.
    targets = [("en", root_category)]
    targets.extend(get_category_langlinks("en", root_category))

    print("Will scrape these category entries:")
    for code, name in targets:
        print(f" - [{code}] Category:{name}")

    for code, name in targets:
        destination = OUTPUT_ROOT / code / sanitize(name)
        scrape_category(code, name, destination, depth=1)

    print(f"\n✅ All results saved to: {OUTPUT_ROOT}")


if __name__ == "__main__":
    main()

Will scrape these category entries:
 - [en] Category:History_of_ideologies
 - [ab] Category:Акатегориа:Аидеологиақәа рҭоурых
 - [ar] Category:تصنيف:تاريخ العقائد
 - [az] Category:Kateqoriya:İdeologiyalar tarixi
 - [be-tarask] Category:Катэгорыя:Гісторыя ідэалёгіяў
 - [bn] Category:বিষয়শ্রেণী:মতাদর্শের ইতিহাস
 - [bs] Category:Kategorija:Historija ideologija
 - [el] Category:Κατηγορία:Ιστορία των ιδεολογιών
 - [eu] Category:Kategoria:Ideologien historia
 - [fa] Category:رده:تاریخ ایدئولوژی
 - [fi] Category:Luokka:Aatehistoria
 - [gl] Category:Categoría:Historia das ideoloxías
 - [hr] Category:Kategorija:Povijest ideologija
 - [id] Category:Kategori:Sejarah ideologi
 - [io] Category:Kategorio:Historio di ideologii
 - [ka] Category:კატეგორია:იდეოლოგიების ისტორია
 - [ru] Category:Категория:История идеологий
 - [sr] Category:Категорија:Историја идеологија
 - [vi] Category:Thể loại:Lịch sử ý thức hệ
 - [zh] Category:Category:意识形态史
→ depth=1, scraping Category:History_of_ideologies [en]
✔ Sav

KeyboardInterrupt: 