<a href="https://colab.research.google.com/github/ada-presh/My-Projects/blob/main/webscrapecheckpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# 1. Get and parse HTML content
def get_html_content(url, timeout=10):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block forever.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``html.parser`` backend.

    Raises:
        Exception: If the server answers with any status code other than 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve page: {response.status_code}")
    return BeautifulSoup(response.content, 'html.parser')

# 2. Extract article title
def extract_title(soup):
    """Return the article title from a parsed page.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object) to search.

    Returns:
        The text of the ``<h1 id="firstHeading">`` element with surrounding
        whitespace removed.

    Raises:
        ValueError: If the page contains no ``<h1 id="firstHeading">``.
    """
    heading = soup.find('h1', id='firstHeading')
    if heading is None:
        # find() yields None on no match; fail with a clear message instead
        # of an AttributeError on None.
        raise ValueError(
            "Page has no <h1 id='firstHeading'> element; cannot extract title"
        )
    return heading.text.strip()

# 3. Extract article text with headings
def extract_text_with_headings(soup):
    """Group the article's paragraphs under the heading that precedes them.

    Walks every ``h2``, ``h3`` and ``p`` tag inside the
    ``div.mw-parser-output`` container in document order. Paragraphs that
    appear before the first heading are stored under ``'Introduction'``.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object) to search.

    Returns:
        A dict mapping heading text (with any ``[edit]`` marker removed) to
        the section's paragraphs joined by newlines. A heading with no
        paragraphs maps to an empty string. Returns an empty dict when the
        content container is not present.
    """
    content_div = soup.find('div', class_='mw-parser-output')
    if content_div is None:
        return {}

    sections = {}
    current_heading = None

    for tag in content_div.find_all(['h2', 'h3', 'p']):
        if tag.name in ('h2', 'h3'):
            current_heading = tag.text.replace('[edit]', '').strip()
            # setdefault, not assignment: a heading that repeats (or one
            # literally called "Introduction") must not wipe out the
            # paragraphs already collected under the same key.
            sections.setdefault(current_heading, [])
        else:
            key = current_heading or 'Introduction'
            sections.setdefault(key, []).append(tag.text.strip())

    # Collapse each section's paragraph list into a single string.
    return {
        heading: '\n'.join(paragraphs)
        for heading, paragraphs in sections.items()
    }

# 4. Extract internal Wikipedia links
def extract_internal_links(soup):
    """Collect the unique Wikipedia article URLs linked from a page.

    Only hrefs that start with ``/wiki/`` and contain no ``:`` are kept,
    which leaves out external links, in-page anchors and namespaced
    targets such as ``File:`` or ``Category:``. Section fragments
    (``#...``) are dropped so that links to different sections of one
    article collapse into a single URL.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object) to search.

    Returns:
        A sorted list of absolute ``https://en.wikipedia.org/wiki/...`` URLs
        without duplicates. Sorting keeps the result stable between runs,
        which a plain set-to-list conversion does not.
    """
    base_url = "https://en.wikipedia.org"
    links = set()
    for a_tag in soup.find_all('a', href=True):
        # Strip the fragment first so "/wiki/X" and "/wiki/X#Sec" dedupe.
        path = a_tag['href'].partition('#')[0]
        if path.startswith('/wiki/') and ':' not in path:
            links.add(urljoin(base_url, path))
    return sorted(links)

# 5. Wrapper function
def scrape_wikipedia_page(url):
    """Fetch one page and bundle its title, section text and internal links.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict with three keys: ``'title'`` (result of ``extract_title``),
        ``'text_sections'`` (result of ``extract_text_with_headings``) and
        ``'internal_links'`` (result of ``extract_internal_links``).
    """
    page = get_html_content(url)

    # Fill the result in the same order the extractors are run.
    result = {'title': extract_title(page)}
    result['text_sections'] = extract_text_with_headings(page)
    result['internal_links'] = extract_internal_links(page)
    return result

# 6. Test the function on a sample Wikipedia page
if __name__ == "__main__":
    # Demo run: scrape a single article and print a short preview of
    # everything that was extracted.
    test_url = "https://en.wikipedia.org/wiki/Web_scraping"
    data = scrape_wikipedia_page(test_url)

    print(f"Title: {data['title']}\n")

    print("Sections and Paragraphs:")
    text_sections = data['text_sections']
    for section in text_sections:
        text = text_sections[section]
        preview = text[:300]  # cap each section at its first 300 characters
        print(f"\n== {section} ==\n{preview}...")

    internal_links = data['internal_links']
    print(f"\nNumber of Internal Links Found: {len(internal_links)}")
    print("Sample Links:", internal_links[:5])


Title: Web scraping

Sections and Paragraphs:

== Introduction ==
Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites.[1] Web scraping software may directly access the World Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term t...

== History ==
After the birth of the World Wide Web in 1989, the first web robot,[2] World Wide Web Wanderer, was created in June 1993, which was intended only to measure the size of the web.
In December 1993, the first crawler-based web search engine, JumpStation, was launched. As there were fewer websites avail...

== Techniques ==
Web scraping is the process of automatically mining data or collecting information from the World Wide Web. It is a field with active developments sharing a common goal with the semantic web vision, an ambitious initiative that still requires breakthroughs in text processing, semantic underst