In [8]:
import json
from urllib.parse import quote

import requests
import wikipediaapi
from bs4 import BeautifulSoup

In [None]:
def fetch_wikipedia_page(title, lang="en"):
    '''Look up a Wikipedia article through the ``wikipediaapi`` client.

    Args:
        title: Title of the article to look up.
        lang: Language code handed to the client (defaults to ``"en"``).

    Returns:
        The page object produced by ``wikipediaapi`` for ``title``.

    Raises:
        ValueError: If the client reports that the page does not exist.
    '''
    client = wikipediaapi.Wikipedia(user_agent='your-user-agent', language=lang)
    article = client.page(title)
    # Happy path first: hand back the page as soon as it is known to exist.
    if article.exists():
        return article
    raise ValueError(f"Page '{title}' does not exist.")

def extract_sections(page, level=0):
    '''Recursively turn a page's section tree into nested dictionaries.

    Args:
        page: Any object exposing a ``sections`` iterable whose items have
            ``title``, ``text`` and their own ``sections``.
        level: Nesting depth recorded for the direct children of ``page``.

    Returns:
        A dict keyed by section title. Each value holds the section's
        ``"level"``, its ``"text"`` and a ``"subsections"`` dict built the
        same way one level deeper. Sections sharing a title under the same
        parent collapse to the last one seen.
    '''
    return {
        child.title: {
            "level": level,
            "text": child.text,
            "subsections": extract_sections(child, level + 1),
        }
        for child in page.sections
    }

def _wikipedia_url(title, lang="en"):
    '''Build the article URL for ``title`` on the ``lang`` edition of Wikipedia.'''
    # Spaces become underscores; characters that would otherwise change the
    # meaning of the URL ("?", "#", "%", ...) are percent-encoded, while common
    # title punctuation ("/", ":", parentheses, commas) is left readable.
    slug = quote(title.replace(' ', '_'), safe="/:(),")
    return f"https://{lang}.wikipedia.org/wiki/{slug}"


def extract_references(title, lang="en", timeout=10):
    '''Scrape the citation texts from an article's rendered HTML page.

    Args:
        title: Title of the article whose references are wanted.
        lang: Wikipedia language edition to fetch from (defaults to ``"en"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with the text of every ``<cite>`` element found inside
        ``ol.references`` list items, in document order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    '''
    response = requests.get(_wikipedia_url(title, lang), timeout=timeout)
    # Fail loudly rather than parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    return [
        ref.get_text(separator=" ", strip=True)
        for ref in soup.select("ol.references li cite")
    ]


def scrape_wikipedia(title, lang="en"):
    '''Collect title, summary, sections, references and URL for one article.

    Args:
        title: Title of the article to scrape.
        lang: Wikipedia language edition used for every lookup, including
            the references fetch and the reported URL.

    Returns:
        A dict with the keys ``"title"``, ``"summary"``, ``"sections"``,
        ``"references"`` and ``"url"``.

    Raises:
        ValueError: Propagated from ``fetch_wikipedia_page`` when the page
            does not exist.
    '''
    page = fetch_wikipedia_page(title, lang)
    return {
        "title": page.title,
        "summary": page.summary,
        "sections": extract_sections(page),
        # Pass ``lang`` through so references come from the same edition as
        # the page itself.
        "references": extract_references(title, lang),
        "url": _wikipedia_url(title, lang),
    }

In [15]:
title = "Nuclear fission"
data = scrape_wikipedia(title)

# Derive the output filename once so the file that is written and the path
# that is reported below can never drift apart.
output_path = f"{title.replace(' ', '_')}.json"

# Save to JSON. ensure_ascii=False keeps non-ASCII article text as readable
# UTF-8 (the file is opened as UTF-8) instead of \uXXXX escape sequences.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print(f"Data saved to {output_path}")

Data saved to Nuclear_fission.json


In [16]:
# Fetch the reference list on its own for inspection. scrape_wikipedia()
# already made this same call and stored its result in data["references"].
references = extract_references(title)