The objective is to automate the extraction of HTML content, article titles, text, and internal links from Wikipedia pages into a consolidated function that accepts any Wikipedia URL for efficient data retrieval and processing.


Instructions

Create a Python script to automate data extraction from Wikipedia pages. The script will retrieve HTML content, extract article titles and text, collect internal links, and consolidate these tasks into one function that accepts a Wikipedia URL. This will be tested on a specific Wikipedia page to validate functionality.

1) Write a function to Get and parse html content from a Wikipedia page

2) Write a function to Extract article title

3) Write a function to Extract article text for each paragraph with their respective

headings. Map those headings to their respective paragraphs in the dictionary.

4) Write a function to collect every link that redirects to another Wikipedia page

5) Wrap all the previous functions into a single function that takes as parameters a Wikipedia link

6) Test the last function on a Wikipedia page of your choice

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# 1. Get and parse HTML content from a Wikipedia page
def fetch_html(url):
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        raise Exception(f"Failed to retrieve content. Status code: {response.status_code}")

# 2. Extract article title
def extract_title(soup):
    return soup.find("h1", {"id": "firstHeading"}).text.strip()

# 3. Extract article text with headings mapped to paragraphs
def extract_headings_and_paragraphs(soup):
    content_div = soup.find("div", {"class": "mw-parser-output"})
    sections = {}
    current_heading = "Introduction"
    sections[current_heading] = []

    for element in content_div.find_all(['h2', 'h3', 'p'], recursive=False):
        if element.name in ['h2', 'h3']:
            span = element.find("span", class_="mw-headline")
            if span:
                current_heading = span.text.strip()
                sections[current_heading] = []
        elif element.name == 'p':
            paragraph = element.get_text(strip=True)
            if paragraph:
                sections[current_heading].append(paragraph)

    # Combine list of paragraphs into single strings per heading
    return {heading: "\n".join(paragraphs) for heading, paragraphs in sections.items()}

# 4. Collect every link that redirects to another Wikipedia page
def extract_internal_links(soup):
    links = set()
    for link in soup.find_all("a", href=True):
        href = link['href']
        if href.startswith("/wiki/") and not any(prefix in href for prefix in [":", "#"]):
            full_url = urljoin("https://en.wikipedia.org", href)
            links.add(full_url)
    return list(links)

# 5. Wrap all into one function
def extract_wikipedia_data(url):
    soup = fetch_html(url)
    title = extract_title(soup)
    content = extract_headings_and_paragraphs(soup)
    internal_links = extract_internal_links(soup)
    
    return {
        "title": title,
        "content": content,
        "internal_links": internal_links
    }

# 6. Test the function
if __name__ == "__main__":
    test_url = "https://en.wikipedia.org/wiki/Web_scraping"
    data = extract_wikipedia_data(test_url)
    
    print("Title:", data["title"])
    print("\nContent Preview:")
    for heading, paragraph in list(data["content"].items())[:3]:
        print(f"\n## {heading}:\n{paragraph[:300]}...")  # Preview first 300 chars
    print(f"\nTotal internal links found: {len(data['internal_links'])}")


Title: Web scraping

Content Preview:

## Introduction:
...

Total internal links found: 133
