In [1]:
# --- Standard Library ---
import os
import re
import io
import sys
import time
import json
import math
import html
import random
import pathlib
import zipfile
import urllib.request
import urllib.parse
from collections import Counter, defaultdict

# --- Third-party libraries ---
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib as mpl
import matplotlib.patches as mpatches
import networkx as nx
from bs4 import BeautifulSoup
from wordcloud import WordCloud

# --- NLP / Text ---
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords, words, wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

# --- NLTK Downloads ---
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('words')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\astri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\astri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\astri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\astri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\astri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\astri\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!

True

In [None]:
# We will save the text about each performer as a txt file in the folder "performers"
save_dir = "managers"
os.makedirs(save_dir, exist_ok=True)

# wiki api
baseurl = "https://en.wikipedia.org/w/api.php"
user_agent = "PremierLeagueManagerDownloader/1.0"

def get_category_members(category_title, depth=1):
    """
    Fetch all pages (and optionally subcategories) under a Wikipedia category.
    depth=1 means include one level of subcategories.
    """
    
    #Skip Premier League Manager...... and List of Premier League managers
    members = []
    cmcontinue = ""
    while True:
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": f"Category:{category_title}",
            "cmlimit": "max",
            "format": "json"
        }
        if cmcontinue:
            params["cmcontinue"] = cmcontinue

        query = baseurl + "?" + urllib.parse.urlencode(params)
        req = urllib.request.Request(query, headers={"User-Agent": user_agent})
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read().decode("utf-8"))

        pages = data.get("query", {}).get("categorymembers", [])
        for p in pages:
            ns = p.get("ns")
            title = p.get("title")
            if ns == 0:
                members.append(title)
            elif ns == 14 and depth > 0:  # category namespace
                print(f"→ Exploring subcategory: {title}")
                members += get_category_members(title.replace("Category:", ""), depth=depth - 1)

        if "continue" in data:
            cmcontinue = data["continue"]["cmcontinue"]
            time.sleep(0.3)
        else:
            break
    return members

def fetch_wikitext(title):
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "titles": title,
        "format": "json"
    }
    query = baseurl + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(query, headers={"User-Agent": user_agent})
    with urllib.request.urlopen(req) as response:
        data = json.loads(response.read().decode("utf-8"))

    pages = data["query"]["pages"]
    page = next(iter(pages.values()))
    try:
        return page["revisions"][0]["slots"]["main"]["*"]
    except (KeyError, IndexError):
        return None

In [None]:
# --- MAIN EXECUTION ---
category = "Premier League managers"
print(f"Fetching managers from category: {category}")
managers = get_category_members(category, depth=1)
print(f"Found {len(managers)} manager pages")

In [None]:
# saves the texts in the performers folder
for manager in managers:
    safe_title = manager.replace(" ", "_").replace("/", "_")
    wikitext = fetch_wikitext(manager)
    if not wikitext:
        print(f"Skipping {manager}, no text found")
        continue

    filename = os.path.join(save_dir, f"{safe_title}.txt")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(wikitext)

    print(f"Saved {manager} → {filename}")
    time.sleep(0.5)  # be polite to Wikipedia

In [None]:
pages = [
    "Glossary of association football terms",
    "Association football tactics and skills",
    "Formation (association football)",
    "Association football positions",
    "Association football",
]

In [None]:
# Folder to save pages
save_dir = "football_pages"
os.makedirs(save_dir, exist_ok=True)

# Wikipedia API setup
baseurl = "https://en.wikipedia.org/w/api.php"
user_agent = "FootballPageDownloader/1.0"
# 
def fetch_wikitext(title):
    """Fetch full Wikipedia page content (wikitext) for a given title."""
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "main",
        "rvprop": "content",
        "titles": title,
        "format": "json"
    }
    query = baseurl + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(query, headers={"User-Agent": user_agent})
    with urllib.request.urlopen(req) as response:
        data = json.loads(response.read().decode("utf-8"))

    pages = data["query"]["pages"]
    page = next(iter(pages.values()))
    try:
        return page["revisions"][0]["slots"]["main"]["*"]
    except (KeyError, IndexError):
        return None

def save_txt(title, folder):
    """Fetch a Wikipedia page and save it as a text file in the given folder."""
    text = fetch_wikitext(title)
    if not text:
        print(f"⚠️ Skipped {title}: no text found.")
        return
    safe_name = title.replace("/", "_").replace(":", "_")
    path = os.path.join(folder, f"{safe_name}.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"✅ Saved {title}")

# Example: list of mentality / tactical / philosophy pages


In [None]:
# Fetch and save all pages
for title in pages:
    save_txt(title, save_dir)
    time.sleep(0.5)  # be kind to the API

In [None]:
save_dir = "football_terminology"
os.makedirs(save_dir, exist_ok=True)

baseurl = "https://en.wikipedia.org/w/api.php"
user_agent = "FootballTerminologyDownloader/1.0"

In [None]:
# --- Fetch category pages ---
def get_category_members(category_title, depth=0):
    """Fetch all pages (and optionally subcategories) under a Wikipedia category."""
    members = []
    cmcontinue = ""
    while True:
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": f"Category:{category_title}",
            "cmlimit": "max",
            "format": "json"
        }
        if cmcontinue:
            params["cmcontinue"] = cmcontinue

        query = baseurl + "?" + urllib.parse.urlencode(params)
        req = urllib.request.Request(query, headers={"User-Agent": user_agent})
        with urllib.request.urlopen(req) as response:
            data = json.loads(response.read().decode("utf-8"))

        pages = data.get("query", {}).get("categorymembers", [])
        for p in pages:
            ns = p.get("ns")
            title = p.get("title")
            if ns == 0:
                members.append(title)
            elif ns == 14 and depth > 0:
                print(f"→ Exploring subcategory: {title}")
                members += get_category_members(title.replace("Category:", ""), depth - 1)

        if "continue" in data:
            cmcontinue = data["continue"]["cmcontinue"]
            time.sleep(0.3)
        else:
            break
    return members


# --- Fetch full page text ---
def fetch_wikitext(title):
    params = {
        "action": "query",
        "prop": "revisions",
        "rvslots": "main",
        "rvprop": "content",
        "titles": title,
        "format": "json"
    }
    query = baseurl + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(query, headers={"User-Agent": user_agent})
    with urllib.request.urlopen(req) as response:
        data = json.loads(response.read().decode("utf-8"))
    pages = data["query"]["pages"]
    page = next(iter(pages.values()))
    try:
        return page["revisions"][0]["slots"]["main"]["*"]
    except (KeyError, IndexError):
        return None


In [None]:
category = "Association football terminology"
pages = get_category_members(category, depth=1)  # include subcategories too

print(f"Found {len(pages)} pages in category '{category}'.")

for title in pages:
    save_txt(title, save_dir)
    time.sleep(0.3)