In [None]:
# Lightly edited web-scraping script from ChatGPT (o3 mini)

import requests
from bs4 import BeautifulSoup
import re
import json

def fetch_category_members(category, limit=500):
    """
    Fetches all page titles in a given MediaWiki category.

    Requests are repeated with the continuation parameters returned by the
    API until the response no longer contains a 'continue' entry.

    Args:
        category: Category name without the 'Category:' prefix.
        limit: Value sent as 'cmlimit' (results per request).

    Returns:
        A list of the 'title' values of every category member, in the order
        the API returned them.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        KeyError: If a response lacks the 'query'/'categorymembers' entries.
    """
    url = 'https://millionaire.fandom.com/api.php'
    params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Category:{category}',
        'cmlimit': limit,
        'format': 'json'
    }
    titles = []
    # A single session is used for all pages of the listing.
    with requests.Session() as session:
        while True:
            # Without a timeout a stalled connection would block forever.
            response = session.get(url, params=params, timeout=30)
            # Surface HTTP errors directly instead of failing later while
            # decoding an error page as JSON.
            response.raise_for_status()
            payload = response.json()
            titles.extend(m['title'] for m in payload['query']['categorymembers'])
            if 'continue' not in payload:
                break
            # Merge the continuation tokens into the next request.
            params.update(payload['continue'])
    return titles


def fetch_page_html(title):
    """
    Retrieves the rendered HTML content for a given page title.

    Args:
        title: Page title passed as the 'page' parameter of the 'parse'
            API action.

    Returns:
        The string stored under ['parse']['text']['*'] in the JSON response.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        KeyError: If the response has no 'parse' entry.
    """
    url = 'https://millionaire.fandom.com/api.php'
    params = {
        'action': 'parse',
        'page': title,
        'prop': 'text',
        'format': 'json'
    }
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, params=params, timeout=30)
    # Surface HTTP errors directly instead of failing later while decoding
    # an error page as JSON.
    response.raise_for_status()
    payload = response.json()
    return payload['parse']['text']['*']

def parse_questions(html):
    """
    Extracts question texts from a page's rendered HTML.

    Only table rows are examined: a row qualifies when its 'style' attribute
    matches a black background colour and it contains a <td colspan="2">.
    The text of that cell's <b> tag is kept only when the cell holds exactly
    one <b> tag.

    Returns a list of question strings in document order.
    """
    black_background = re.compile('background-color:\\s*#000000', re.I)
    document = BeautifulSoup(html, 'html.parser')

    found = []
    for row in document.find_all('tr', style=black_background):
        cell = row.find('td', colspan="2")
        # Rows without a two-column cell contribute nothing.
        bold_tags = cell.find_all('b') if cell else []
        # Cells with zero or several bold tags are skipped.
        if len(bold_tags) == 1:
            found.append(bold_tags[0].get_text())

    return found


def scrape_contestant_questions(category_name):
    """
    Scrapes the questions for every page listed in a category.

    Lists the category members, then for each title prints a progress line,
    fetches the page HTML and parses the questions out of it.

    Returns a dict mapping each page title to its list of questions.
    """
    questions_by_title = {}
    for page_title in fetch_category_members(category_name):
        print(f"Scraping: {page_title}")
        page_html = fetch_page_html(page_title)
        questions_by_title[page_title] = parse_questions(page_html)
    return questions_by_title


if __name__ == '__main__':
    # Scrape every page in the category and keep the title -> questions map.
    category = 'Contestants_from_the_U.S.'
    data = scrape_contestant_questions(category)
    # Write the result as indented, non-ASCII-escaped JSON.
    with open('millionaire_questions.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False, indent=2))
    print(f"Scraped questions for {len(data)} contestants. Output saved to 'millionaire_questions.json'.")

In [None]:
# Take a look at the output here
import json

# The file is written as UTF-8 with ensure_ascii=False, so read it back as
# UTF-8 explicitly instead of relying on the platform default encoding.
with open('millionaire_questions.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the per-page question lists into a single list.
all_questions = [x for qs in data.values() for x in qs]
print('total questions:', len(all_questions))
# Whitespace-separated token count across all questions.
print('word count:', len(' '.join(all_questions).split()))

total questions: 25865
word count: 430258
