In [None]:
!pip install beautifulsoup4 lxml --quiet

In [None]:
from bs4 import BeautifulSoup

# Assign the HTML content from the previous cell to html_content
html_content = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>

    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>

    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>

    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>

    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>

</body>
</html>
"""

# --- Step 2: Create a BeautifulSoup object ---
soup = BeautifulSoup(html_content, "html.parser")

In [None]:
# --- Step 3: Find the title of the webpage ---
title = soup.title.string

In [None]:
# --- Step 4: Extract all paragraphs (<p> tags) ---
paragraphs = [p.get_text() for p in soup.find_all('p')]

In [None]:
# --- Step 5: Retrieve all links (<a href="">) ---
links = [a['href'] for a in soup.find_all('a', href=True)]

In [None]:
# --- Display results ---
print("Page Title:", title)
print("Paragraphs:", paragraphs)
print("Links:", links)

In [None]:
# Exercise 2 : Scraping robots.txt from Wikipediaё
import requests

url = "https://en.wikipedia.org/robots.txt"

# Add a browser-like User-Agent header
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/118.0.5993.70 Safari/537.36"
}

response = requests.get(url, headers=headers)

if response.status_code == 200:
    print(" Wikipedia robots.txt content:\n")
    print(response.text)
else:
    print(f" Failed to retrieve robots.txt. Status code: {response.status_code}")


In [None]:
# Exercise 3 : Extracting Headers from Wikipedia’s Main Page
# URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/Wikipedia"

# Add headers to avoid 403 Forbidden
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/118.0.5993.70 Safari/537.36"
}

# Fetch the page content
response = requests.get(url, headers=headers)

if response.status_code == 200:
    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all header tags (h1–h6)
    headers = []
    for i in range(1, 7):
        for tag in soup.find_all(f"h{i}"):
            headers.append((f"h{i}", tag.get_text(strip=True)))

    # Display the results
    print("Header tags found on the page:\n")
    for tag_name, text in headers:
        print(f"{tag_name}: {text}")

else:
    print(f"Failed to retrieve page. Status code: {response.status_code}")


In [None]:
#  Exercise 4 : Checking for Page Title
# Find all header tags (h1–h6)
all_headers = []
for i in range(1, 7):
    tags = soup.find_all(f"h{i}")
    for tag in tags:
        all_headers.append((f"h{i}", tag.get_text(strip=True)))

# Display results
print("Header tags found on the page:\n")
for tag_name, text in all_headers:
    print(f"{tag_name}: {text}")


In [9]:
# Exercise 4 : Checking for Page Title
from bs4 import BeautifulSoup

# Suppose 'html_content' already contains the loaded HTML page
html_content = """
<!DOCTYPE html>
<html>
<head>
    <title>Example Page</title>
</head>
<body>
    <h1>Hello World!</h1>
</body>
</html>
"""

# Create BeautifulSoup object
soup = BeautifulSoup(html_content, "html.parser")

# Check if a title tag exists
title_tag = soup.title

if title_tag and title_tag.string:
    print("The page contains a title:", title_tag.string)
else:
    print("The page does not contain a title tag.")


The page contains a title: Example Page


In [10]:
# Exercise 5 : Analyzing US-CERT Security Alerts
import datetime

def count_cisa_alerts(year=None, page_limit=10):
    if year is None:
        year = datetime.datetime.now().year
    base_url = 'https://www.cisa.gov/news-events/cybersecurity-advisories'
    # this query parameter filters to “alert” type (advisory_type=93)
    params = {
        'f[0]': 'advisory_type:93',
        'page': 0
    }
    count = 0
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/118.0.5993.70 Safari/537.36"
    })
    for page in range(page_limit):
        params['page'] = page
        resp = session.get(base_url, params=params)
        if resp.status_code != 200:
            print(f"Failed to fetch page {page}, status code {resp.status_code}")
            break
        soup = BeautifulSoup(resp.text, "html.parser")
        # find all entries dated in the given year
        articles = soup.select('article.views-row')  # adjust selector if needed
        if not articles:
            break
        for art in articles:
            # extract date
            date_tag = art.select_one('.views-field-created .field-content')
            if not date_tag:
                continue
            # parse date somewhere like "Sep 23, 2025"
            try:
                date_str = date_tag.get_text(strip=True)
                date_obj = datetime.datetime.strptime(date_str, "%b %d, %Y")
            except Exception:
                continue
            if date_obj.year == year:
                count += 1
            elif date_obj.year < year:
                # assuming entries are sorted by date descending, we can stop
                return count
        # next page
    return count

if __name__ == "__main__":
    year = datetime.datetime.now().year
    num_alerts = count_cisa_alerts(year=year, page_limit=20)
    print(f"Number of CISA “Alert”-type entries for {year}: {num_alerts}")


Number of CISA “Alert”-type entries for 2025: 0


In [11]:
# Exercise 6 : Scraping Movie Details
import random
import time

LIST_URL = "https://www.imdb.com/list/ls091294718/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/118.0.5993.70 Safari/537.36"
}

def get_movie_links_from_list():
    """Fetch the list page and return all movie detail URLs from that list."""
    resp = requests.get(LIST_URL, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    links = []
    for a in soup.select("h3.lister-item-header a"):
        href = a.get("href")
        if href and href.startswith("/title/"):
            links.append("https://www.imdb.com" + href)
    return links

def extract_movie_details(url):
    """Given a movie detail page URL, return (title, year, summary)."""
    resp = requests.get(url, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Title and year
    title_tag = soup.find("h1")
    if not title_tag:
        return None
    title_text = title_tag.get_text(strip=True)
    # IMDb often has the year in a span or appended; clean up:
    year_span = soup.find("span", id="titleYear")
    year = year_span.get_text(strip=True).strip("()") if year_span else ""

    # Summary / plot
    summary_tag = soup.select_one("div.summary_text")
    if not summary_tag:
        # alternative: look for <span class="GenresAndPlot__TextContainerBreakpointXS_TO_M-cum89p-0 dcFkRD"> etc
        summary = ""
    else:
        summary = summary_tag.get_text(strip=True)

    return {
        "title": title_text,
        "year": year,
        "summary": summary
    }

def main():
    links = get_movie_links_from_list()
    print(f"Found {len(links)} movies in the list.")
    chosen = random.sample(links, min(10, len(links)))
    results = []
    for idx, url in enumerate(chosen, start=1):
        print(f"Fetching {idx}/10: {url}")
        details = extract_movie_details(url)
        if details:
            results.append(details)
        else:
            print("Failed to extract details for:", url)
        time.sleep(1)  # polite delay
    print("\n=== Selected Movies ===")
    for movie in results:
        print(f"Title: {movie['title']}")
        print(f"Year:  {movie['year']}")
        print(f"Summary: {movie['summary']}")
        print("-" * 40)

if __name__ == "__main__":
    main()


Found 0 movies in the list.

=== Selected Movies ===
