In [None]:
!pip install beautifulsoup4



In [None]:
import datetime
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

## Exercise 1 : Parsing HTML With BeautifulSoup

**Instructions**

**Objective:** Use urlopen() to fetch the HTML content of a webpage and then parse it using BeautifulSoup.

- Read the HTML content of the page.
- Create a BeautifulSoup object to parse this HTML.
- Find the title of the webpage (the content inside the `<title>` tag)
- Extract all paragraphs (`<p>` tags) from the page.
- Retrieve all links (URLs in `<a href="">` tags) on the page.

In [None]:
# Local path of the saved page (the name `url` is kept because the later
# cells rebind the same variable to real URLs).
url = "/content/index.html"
# Read with an explicit encoding so the decoded text does not depend on the
# platform's default locale encoding.
with open(url, "r", encoding="utf-8") as f:
    page = f.read()

# Parse the markup with Python's built-in HTML parser.
soup = BeautifulSoup(page, "html.parser")

In [None]:
# Gather what the exercise asks for: the <title> element, every <p> element
# and every <a> element of the parsed page.
title = soup.title  # attribute access is shorthand for soup.find("title")
paragraphs, links = soup.find_all("p"), soup.find_all("a")
print(title.text, paragraphs, links)

Sports World [<p>Your one-stop destination for the latest sports news and videos.</p>, <p>Read about the latest football matches and player news.</p>, <p>Watch highlights from the latest NBA games.</p>, <p>Get the latest updates from the world of Grand Slam tennis.</p>] [<a href="#football">Football</a>, <a href="#basketball">Basketball</a>, <a href="#tennis">Tennis</a>]


## Exercise 2 : Scraping Robots.Txt From Wikipedia

**Instructions**

Write a Python program to download and display the content of robots.txt for en.wikipedia.org.

In [None]:
url = "https://en.wikipedia.org/robots.txt"

# Download the raw bytes of the file.
with urllib.request.urlopen(url) as f:
    robots = f.read()
# The payload starts with a UTF-8 byte-order mark (b'\xef\xbb\xbf'); decoding
# with "utf-8-sig" drops it and prints readable multi-line text instead of the
# bytes repr with escaped "\n" sequences.
print(robots.decode("utf-8-sig"))

b'\xef\xbb\xbf# robots.txt for http://www.wikipedia.org/ and friends\n#\n# Please note: There are a lot of pages on this site, and there are\n# some misbehaved spiders out there that go _way_ too fast. If you\'re\n# irresponsible, your access to the site may be blocked.\n#\n\n# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN\n# and ignoring 429 ratelimit responses, claims to respect robots:\n# http://mj12bot.com/\nUser-agent: MJ12bot\nDisallow: /\n\n# advertising-related bots:\nUser-agent: Mediapartners-Google*\nDisallow: /\n\n# Wikipedia work bots:\nUser-agent: IsraBot\nDisallow:\n\nUser-agent: Orthogaffe\nDisallow:\n\n# Crawlers that are kind enough to obey, but which we\'d rather not have\n# unless they\'re feeding search engines.\nUser-agent: UbiCrawler\nDisallow: /\n\nUser-agent: DOC\nDisallow: /\n\nUser-agent: Zao\nDisallow: /\n\n# Some bots are known to be trouble, particularly those designed to copy\n# entire sites. Please obey robots.txt.\nUser-agent:

## Exercise 3 : Extracting Headers From Wikipedia’s Main Page

**Instructions**

Write a Python program to extract and display all the header tags from en.wikipedia.org/wiki/Main_Page.



In [None]:
# Address of the page exactly as named in the exercise (no trailing slash,
# which would name a different title).
url = "https://en.wikipedia.org/wiki/Main_Page"

with urllib.request.urlopen(url) as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")
# Passing a list of names matches any of the six heading levels in one pass.
headers = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
for header in headers:
    # Print the heading level next to its visible text.
    print(header.name, ":", header.text)

h1 : Main Page
h1 : Welcome to Wikipedia
h2 : From today's featured article
h2 : Did you know ...
h2 : In the news
h2 : On this day
h2 : From today's featured list
h2 : Today's featured picture
h2 : Other areas of Wikipedia
h2 : Wikipedia's sister projects
h2 : Wikipedia languages


## Exercise 4 : Checking For Page Title

**Instructions**

Write a Python program to check whether a page contains a title or not.

In [None]:
# Address of the page to inspect (no trailing slash, which would name a
# different title).
url = "https://en.wikipedia.org/wiki/Main_Page"

with urllib.request.urlopen(url) as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")
title = soup.find("title")
# find() returns None when there is no <title> element; an element that is
# present but empty or whitespace-only is also reported as "no title".
if title is None or not title.text.strip():
    print("No title found")
else:
    print(title.text)

Wikipedia, the free encyclopedia


## Exercise 5 : Analyzing US-CERT Security Alerts

**Instructions**

Write a Python program to get the number of security alerts issued by US-CERT in the current year.\
Source: https://www.us-cert.gov/ncas/alerts

In [None]:
# Count the advisories listed for the current calendar year by walking the
# paginated listing until a page comes back with no entries.
# The year is taken from the clock rather than hard-coded, so the result
# matches the "current year" wording of the task and of the printed message.
current_year = datetime.date.today().year
number_of_alerts = 0
counter = 0  # zero-based index sent in the `page` query parameter
while True:
    # NOTE(review): advisory_type 93 is presumably the "alert" category on
    # cisa.gov — confirm against the site's filter list.
    url = (
        "https://www.cisa.gov/news-events/cybersecurity-advisories"
        f"?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A{current_year}"
        f"&page={counter}"
    )

    with urllib.request.urlopen(url) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    # Presumably one 'c-teaser__content' block is rendered per listed advisory.
    alerts = soup.find_all('div', {'class': 'c-teaser__content'})
    if not alerts:
        # An empty page means we have gone past the last page of results.
        break
    number_of_alerts += len(alerts)
    counter += 1
print(f'the number of security alerts issued by US-CERT in the current year: {number_of_alerts}')

the number of security alerts issued by US-CERT in the current year: 213


## Exercise 6 : Scraping Movie Details
**Instructions**

Write a Python program to get movie name, year and a brief summary of the top 10 random movies.

In [None]:
url = "https://www.rottentomatoes.com/browse/movies_at_home/sort:popular"

with urllib.request.urlopen(url) as f:
    page = f.read()

soup = BeautifulSoup(page, "html.parser")
# Each field is collected with its own query over the whole page.
movies = soup.find_all('span', {'data-qa': "discovery-media-list-item-title"})
critics_score = soup.find_all('rt-text', {'slot': 'criticsScore'})
audience_score = soup.find_all('rt-text', {'slot': 'audienceScore'})
release_date = soup.find_all('span', {'data-qa': 'discovery-media-list-item-start-date'})
# The element text is wrapped in newlines and indentation from the page
# markup; strip it so the table holds clean values.
movies_data = {
    "movie_name": [name.text.strip() for name in movies],
    "critics_score": [score.text.strip() for score in critics_score],
    "audience_score": [score.text.strip() for score in audience_score],
    "release_date": [date.text.strip() for date in release_date]
}

# NOTE: the four lists are gathered independently, so pd.DataFrame raises
# ValueError if the page yields a different number of matches per field.
df = pd.DataFrame(movies_data)
df.head(10)

Unnamed: 0,movie_name,critics_score,audience_score,release_date
0,\n Hillbilly Elegy\n,25%,82%,"\n Streaming Nov 24, 2020\n"
1,\n Divorce in the Black\n,0%,74%,"\n Streaming Jul 11, 2024\n"
2,\n Hit Man\n,95%,91%,"\n Streaming Jun 7, 2024\n"
3,\n Horizon: An American Saga - Chapte...,47%,70%,"\n Streaming Jul 16, 2024\n"
4,\n Thelma\n,99%,84%,"\n Streaming Jul 19, 2024\n"
5,\n The Bikeriders\n,81%,74%,"\n Streaming Jul 9, 2024\n"
6,\n The Long Game\n,81%,97%,"\n Streaming Apr 30, 2024\n"
7,\n Land of Bad\n,65%,94%,"\n Streaming Mar 19, 2024\n"
8,\n The Imaginary\n,91%,80%,"\n Streaming Jul 5, 2024\n"
9,\n Young Woman and the Sea\n,88%,98%,"\n Streaming Jul 19, 2024\n"
