In [None]:
import os
import time
import json
import urllib.parse
import urllib.request

In [2]:
# MediaWiki Action API endpoint for English Wikipedia; requests are built by
# appending a URL-encoded query string to this address.
baseurl = "https://en.wikipedia.org/w/api.php"
# Value sent in the User-Agent header of each API request.
user_agent = "PremierLeagueManagerDownloader/2.0"

In [3]:
def get_category_members(category_title, depth=1, exclude_subcats=None):
    """
    Fetch the titles of all article pages in a Wikipedia category.

    Pages through the MediaWiki ``list=categorymembers`` API until the
    response carries no continuation token. Main-namespace pages (ns 0) are
    collected; subcategories (ns 14) are recursed into while ``depth`` is
    positive, unless their name appears in ``exclude_subcats``. Members in
    any other namespace are ignored.

    Args:
        category_title: Category name without the "Category:" prefix.
        depth: Number of subcategory levels to descend into; 0 restricts
            the result to the category itself.
        exclude_subcats: Optional iterable of subcategory names (without
            the "Category:" prefix) that must not be explored.

    Returns:
        A list of page titles in first-seen order with duplicates removed
        (a page can be listed in a category and in one of its
        subcategories).

    Raises:
        Propagates any exception raised by ``urllib.request.urlopen`` or
        ``json.loads`` for a failed or malformed response.
    """
    # Set membership keeps the per-member exclusion check cheap.
    excluded = set(exclude_subcats or ())
    prefix = "Category:"

    members = []
    cmcontinue = ""

    while True:
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": f"{prefix}{category_title}",
            "cmlimit": "max",
            "format": "json",
        }
        if cmcontinue:
            params["cmcontinue"] = cmcontinue

        query = baseurl + "?" + urllib.parse.urlencode(params)
        req = urllib.request.Request(query, headers={"User-Agent": user_agent})
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read().decode("utf-8"))

        for p in data.get("query", {}).get("categorymembers", []):
            ns = p["ns"]
            title = p["title"]

            if ns == 14:  # category namespace
                # Strip only the leading namespace prefix; a blanket
                # replace() would also mangle names that contain the
                # text "Category:" further in.
                clean = title[len(prefix):] if title.startswith(prefix) else title
                if clean in excluded:
                    print(f"Skipping excluded subcategory: {title}")
                elif depth > 0:
                    print(f"Exploring subcategory: {title}")
                    members += get_category_members(clean, depth - 1, exclude_subcats)
            elif ns == 0:  # main (article) namespace
                members.append(title)

        # Stop when the API no longer hands back a continuation token.
        cmcontinue = data.get("continue", {}).get("cmcontinue", "")
        if not cmcontinue:
            break
        time.sleep(0.3)  # brief pause between consecutive result pages

    # Drop repeated titles while preserving the order they were found in.
    return list(dict.fromkeys(members))


def fetch_wikitext(title):
    """
    Fetch the wikitext of a Wikipedia page given its title.

    Sends a ``prop=revisions`` query asking for the content of the main
    slot and extracts the text from the JSON response.

    Args:
        title: Title of the page to look up.

    Returns:
        The wikitext as a string, or None when the response holds no
        revision content for the page (e.g. the page is missing or the
        API answered without a ``query``/``pages`` section).

    Raises:
        Propagates any exception raised by ``urllib.request.urlopen`` or
        ``json.loads`` for a failed or malformed response.
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "main",
        "rvprop": "content",
        "titles": title,
        "format": "json",
    }
    query = baseurl + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(query, headers={"User-Agent": user_agent})

    with urllib.request.urlopen(req) as response:
        data = json.loads(response.read().decode("utf-8"))

    # "pages" is keyed by page id, which is not known in advance, so take
    # the first entry. The whole lookup is guarded: any missing level in
    # the response means there is no text to return.
    try:
        page = next(iter(data["query"]["pages"].values()))
        return page["revisions"][0]["slots"]["main"]["*"]
    except (KeyError, IndexError, TypeError, AttributeError, StopIteration):
        return None


def save_page(title, folder):
    """
    Save a page's wikitext to ``<folder>/<sanitized title>.txt``.

    Args:
        title: Wikipedia page title to download via ``fetch_wikitext``.
        folder: Destination directory; created if it does not exist yet.

    Returns:
        None. A status line is printed whether the page was saved or
        skipped because no text was returned.
    """
    text = fetch_wikitext(title)
    if not text:
        print(f"No text found for {title}. Skipping.")
        return

    # Map path separators and characters that common filesystems
    # (notably Windows) reject in file names to "_".
    unsafe_to_underscore = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})
    safe = title.translate(unsafe_to_underscore)

    # An empty folder means "current directory"; makedirs("") would raise.
    if folder:
        os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, safe + ".txt")

    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"Saved: {title}")


In [4]:
# Output folder for the downloaded manager articles.
save_dir = "managers_test"
os.makedirs(save_dir, exist_ok=True)

# Subcategory names handed to get_category_members as exclude_subcats.
EXCLUDE = ["Lists of Premier League managers"]

print("Fetching Premier League Managers...")
managers = get_category_members("Premier League managers", depth=1, exclude_subcats=EXCLUDE)

print(f"Found {len(managers)} manager pages")

# Save the pages one by one, pausing half a second after each download.
for m in managers:
    save_page(m, save_dir)
    time.sleep(0.5)

Fetching Premier League Managers...
 kipping excluded subcategory: Category:Lists of Premier League managers
Found 287 manager pages
Saved: Micky Adams
Saved: Neil Adams (footballer)
Saved: Tony Adams
Saved: Nigel Adkins
Saved: Dick Advocaat
Saved: Steve Agnew
Saved: Sam Allardyce
Saved: Clive Allen
Saved: Ruben Amorim
Saved: Carlo Ancelotti
Saved: Keith Andrews (footballer)
Saved: Michael Appleton
Saved: Osvaldo Ardiles
Saved: Mikel Arteta
Saved: Ron Atkinson
Saved: Alan Ball Jr.
Saved: Kevin Ball
Saved: Frank Barlow (footballer)
Saved: Dave Bassett
Saved: Rafael Benítez
Saved: Marcelo Bielsa
Saved: Slaven Bilić
Saved: Eric Black
Saved: Frank de Boer
Saved: Billy Bonds
Saved: Tony Book
Saved: Aidy Boothroyd
Saved: Mark Bowen (footballer)
Saved: Bob Bradley
Saved: Ian Branfoot
Saved: Leon Britton
Saved: Trevor Brooking
Saved: Phil Brown (footballer, born 1959)
Saved: Steve Bruce
Saved: George Burley
Saved: Frank Burrows
Saved: Terry Burton
Saved: Michael Carrick
Saved: Carlos Carvalhal

In [5]:
# Output folder for the individually listed topic articles.
football_save_dir = "football_pages_test"

# Articles fetched by exact title rather than through a category listing.
football_pages = [
    "Glossary of association football terms",
    "Association football tactics and skills",
    "Formation (association football)",
    "Association football positions",
    "Association football",
]

os.makedirs(football_save_dir, exist_ok=True)

print("\nFetching Football Topic Pages...")

# Save each listed page, pausing 0.4 s after every download.
for title in football_pages:
    save_page(title, football_save_dir)
    time.sleep(0.4)


Fetching Football Topic Pages...
Saved: Glossary of association football terms
Saved: Association football tactics and skills
Saved: Formation (association football)
Saved: Association football positions
Saved: Association football


In [6]:
# Category whose member pages are downloaded in this cell.
category_title = "Association football terminology"

# Output folder for the terminology articles.
terminology_save_dir = "football_terminology_test"
os.makedirs(terminology_save_dir, exist_ok=True)

print(f"\nFetching category: {category_title}")

# depth=1: direct subcategories are explored as well; nothing is excluded.
terminology_pages = get_category_members(category_title, depth=1)

print(f"Found {len(terminology_pages)} pages in category '{category_title}'.")

# Save each member page, pausing 0.3 s after every download.
for title in terminology_pages:
    save_page(title, terminology_save_dir)
    time.sleep(0.3)



Fetching category: Association football terminology
Exploring subcategory: Category:Hat-trick (association football)
Exploring subcategory: Category:Nicknamed groups of association football players
Found 253 pages in category 'Association football terminology'.
Saved: Association football
Saved: Glossary of association football terms
Saved: Association football tactics and skills
Saved: 2–0 lead is the worst lead
Saved: 4th place trophy
Saved: 6+5 rule
Saved: 12th man (football)
Saved: Administration (British football)
Saved: All-seater stadium
Saved: Angeball
Saved: Anti-football
Saved: Apertura and Clausura
Saved: Article 52 of NOIF
Saved: Assist (association football)
Saved: Assistant referee (association football)
Saved: Away goals rule
Saved: Back-pass rule
Saved: Ball boy
Saved: Ball in and out of play
Saved: Ballet Azul
Saved: The beautiful game
Saved: Best and fairest
Saved: Bicycle kick
Saved: Big Five (Argentine football)
Saved: Big Four (Mexico)
Saved: Big Six (Premier Leag