# CheckPoint Web Scraping

In [None]:
#After watching this video below, you will be able to:

#https://www.youtube.com/watch?v=YY5skv756pc

#1.1) Write a function to Get and parse html content from a Wikipedia page

#1.2) Write a function to Extract article title

#1.3) Write a function to Extract article text for each paragraph with their respective

#headings. Map those headings to their respective paragraphs in the dictionary.

#1.4) Write a function to collect every link that redirects to another Wikipedia page

#1.5) Wrap all the previous functions into a single function that takes as parameters a Wikipedia link

#1.6) Test the last function on a Wikipedia page of your choice

In [1]:
import requests
from bs4 import BeautifulSoup


def get_html_content(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        raise Exception(f"Failed to retrieve HTML content. Status code: {response.status_code}")


def extract_article_title(html):
    soup = BeautifulSoup(html, 'html.parser')
    title_element = soup.find('h1', {'id': 'firstHeading'})
    if title_element:
        return title_element.text.strip()
    else:
        raise Exception("Failed to extract article title.")


def extract_article_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    content_element = soup.find('div', {'id': 'mw-content-text'})
    if content_element:
        paragraphs = content_element.find_all('p', recursive=False)
        text_map = {}

        current_heading = None
        for paragraph in paragraphs:
            if paragraph.find('span', {'class': 'mw-headline'}):
                current_heading = paragraph.text.strip()
                text_map[current_heading] = []
            elif current_heading:
                text_map[current_heading].append(paragraph.text.strip())

        return text_map
    else:
        raise Exception("Failed to extract article text.")


def collect_redirect_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    link_elements = soup.find_all('a', {'class': 'mw-redirect'})
    redirect_links = []
    for link in link_elements:
        href = link.get('href')
        if href.startswith('/wiki/'):
            redirect_links.append(href)

    return redirect_links


def process_wikipedia_page(url):
    html = get_html_content(url)
    title = extract_article_title(html)
    text_map = extract_article_text(html)
    redirect_links = collect_redirect_links(html)

    result = {
        'title': title,
        'text': text_map,
        'redirect_links': redirect_links
    }
    return result


# Example usage:
wikipedia_link = "https://en.wikipedia.org/wiki/OpenAI"
result = process_wikipedia_page(wikipedia_link)
print("Article Title:", result['title'])
print("Article Text:")
for heading, paragraphs in result['text'].items():
    print(heading)
    for paragraph in paragraphs:
        print(paragraph)
    print("---")
print("Redirect Links:", result['redirect_links'])


Article Title: OpenAI
Article Text:
Redirect Links: ['/wiki/Friendly_AI', '/wiki/AI_control_problem', '/wiki/YC_research', '/wiki/DeepMind', '/wiki/Fiduciary_duty', '/wiki/Y_Combinator_(company)', '/wiki/Stripe_(company)', '/wiki/Y_Combinator_(company)', '/wiki/Intelligence_explosion', '/wiki/Amazon.com', '/wiki/DeepMind', '/wiki/ImageNet_Large_Scale_Visual_Recognition_Challenge', '/wiki/Dendi_(Dota_player)', '/wiki/Explainable_AI', '/wiki/Domain_randomization', '/wiki/Application_programming_interface', '/wiki/Upvotes', '/wiki/Parameter_(machine_learning)', '/wiki/Orders_of_magnitude', '/wiki/Application_programming_interface', '/wiki/Autocompletion', '/wiki/ISSN_(identifier)', '/wiki/Mercury_News', '/wiki/ArXiv_(identifier)', '/wiki/ArXiv_(identifier)', '/wiki/ArXiv_(identifier)', '/wiki/ArXiv_(identifier)', '/wiki/ArXiv_(identifier)', '/wiki/Training,_validation,_and_test_sets', '/wiki/Residual_network', '/wiki/Intelligence_explosion']
