# Graded Exercise 2



## Part 2: Gathering Links


### 2.1. Library Installation

In [37]:
import requests
from bs4 import BeautifulSoup

### 2.2. Retrieve HTML Content

In [38]:
# Download the BBC "Europe" news index page. The timeout keeps the cell from
# hanging indefinitely if the server stalls, and raise_for_status() stops the
# notebook on an HTTP error instead of silently parsing an error page.
response = requests.get('https://www.bbc.com/news/world/europe', timeout=30)
response.raise_for_status()
contents = response.text
# Parse the raw HTML into a navigable tree for the extraction steps below.
soup = BeautifulSoup(contents, 'html.parser')

### 2.3. Extract Articles

In [39]:
def extract_article_links(contents):
    """Collect the article containers on a page and the link each one points to.

    Args:
        contents: HTML source of a listing page, as a string.

    Returns:
        A tuple ``(articles, article_links)``. ``articles`` is the list of
        ``<div type="article">`` elements found in the page. ``article_links``
        holds the ``href`` of the first ``<a>`` inside each of them, in
        document order. Containers without a usable link are left out of
        ``article_links``, so it can be shorter than ``articles``.
    """
    soup = BeautifulSoup(contents, 'html.parser')

    articles = soup.find_all('div', attrs={'type': 'article'})

    article_links = []
    for article in articles:
        # find() returns None when the container holds no <a>, and an anchor
        # may lack an href; subscripting either would raise and abort the
        # whole extraction, so such containers are skipped instead.
        anchor = article.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            article_links.append(href)

    return articles, article_links

# Run the extractor on the page source held in `contents` and keep both results.
articles, article_links = extract_article_links(contents)

# Report how many article containers and how many links were found.
print(len(articles), len(article_links), sep='\n')

# Bare last expression: the notebook displays the full (articles, links) tuple.
extract_article_links(contents)

38
38


([<div class="ssrcss-18mhvre-Promo e1vyq2e80" data-testid="promo" type="article"><div class="ssrcss-cmbgq-PromoSwitchLayoutAtBreakpoints et5qctl0"><div class="ssrcss-tq7xfh-PromoContent exn3ah99"><div class="ssrcss-1f3bvyz-Stack e1y4nx260" spacing="2"><a class="ssrcss-afqep1-PromoLink exn3ah91" href="/news/world-europe-68479836"><span role="text"><p class="ssrcss-15dlehh-PromoHeadline exn3ah96"><span aria-hidden="false">Singapore sting: How spies listened in on German general</span></p></span></a><p class="ssrcss-1q0x1qg-Paragraph e1jhz7w10">Berlin said human error was to blame for the intercept of conversations between top military officers.</p><div class="ssrcss-17ojimx-AdditionalLinksWrapper exn3ah90"><ul class="ssrcss-q23uto-Stack e1y4nx260" role="list" spacing="0"><li class="ssrcss-1k4gdup-StyledAdditionalLink efy7ugc1" type="article"><a class="ssrcss-1eakgrr-StyledLink efy7ugc2" href="/news/world-europe-68467333"><div class="ssrcss-1k5s4ar-IconContainer efy7ugc3"><svg aria-hidden

### 2.4. Scrape Multiple Pages

In [40]:
def all_article_links(soup):
    """Gather article links from every page of the BBC 'Europe' section.

    Args:
        soup: Parsed first page of the section. It is only used to read the
            pagination buttons and work out how many pages exist.

    Returns:
        A tuple ``(max_page_number, all_links)``: the number of pages, and the
        list of article ``href`` values collected across all of them.
        Article containers without a usable link are skipped.

    Raises:
        IndexError: If fewer than two pagination buttons are found.
        ValueError: If the selected button's text is not an integer.
        requests.HTTPError: If a page request returns an error status.
    """
    # The page count is read from the second-to-last matching button
    # (presumably the last one is a "next" control -- verify against the page).
    max_pages = soup.find_all('div', class_="ssrcss-3vkeha-StyledButtonContent e1kcrsdk1")[-2]
    max_page_number = int(max_pages.text.strip())

    all_links = []

    for page_number in range(1, max_page_number + 1):
        page_url = f"https://www.bbc.com/news/world/europe?page={page_number}"
        # Bound every request and fail fast on HTTP errors so one stalled or
        # failing page cannot hang the crawl or feed an error page to the parser.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        # Use a separate name so the `soup` argument is not overwritten.
        page_soup = BeautifulSoup(response.text, 'html.parser')
        for article in page_soup.find_all('div', attrs={'type': 'article'}):
            # Skip containers with no <a> or no href instead of raising.
            anchor = article.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                all_links.append(href)
    return max_page_number, all_links

# Fetch and parse the first page of the Europe section; it supplies the
# pagination buttons that all_article_links() reads the page count from.
# The timeout and status check keep a stalled or failing request from
# hanging the cell or being parsed as if it were a normal page.
response = requests.get('https://www.bbc.com/news/world/europe', timeout=30)
response.raise_for_status()
contents = response.text
soup = BeautifulSoup(contents, 'html.parser')
max_page_number, links = all_article_links(soup)

print("Number of pages available in the 'Europe' section:", max_page_number)
print("Number of article links from all pages in 'Europe' section:", len(links))

Number of pages available in the 'Europe' section: 42
Number of article links from all pages in 'Europe' section: 905


### 2.5. Expand the Scope

In [41]:
def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed with BeautifulSoup."""
    # Bound the request and fail fast on HTTP errors so a stalled or failing
    # page cannot hang the crawl or be parsed as if it were a normal page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def all_article_links(region):
    """Gather article links from every page of one BBC world-news region.

    Args:
        region: Region name; one of the keys of ``region_urls`` below
            (e.g. ``'Europe'`` or ``'Middle East'``).

    Returns:
        A tuple ``(max_page_number, all_links)``: the number of pages in the
        region's section, and the list of article ``href`` values collected
        across all of them. For an unknown region the result is
        ``(None, [])`` and no request is made.

    Raises:
        IndexError: If fewer than two pagination buttons are found.
        ValueError: If the selected button's text is not an integer.
        requests.HTTPError: If a page request returns an error status.
    """
    region_urls = {
        'Europe': 'https://www.bbc.com/news/world/europe',
        'Australia': 'https://www.bbc.com/news/world/australia',
        'Asia': 'https://www.bbc.com/news/world/asia',
        'Africa': 'https://www.bbc.com/news/world/africa',
        'Latin America': 'https://www.bbc.com/news/world/latin_america',
        'Middle East': 'https://www.bbc.com/news/world/middle_east'
    }

    region_url = region_urls.get(region)
    if region_url is None:
        # Unknown region: hand back the None sentinel that the per-region
        # loop tests for, rather than passing None to requests.get().
        return None, []

    first_page = _fetch_soup(region_url)
    # The page count is read from the second-to-last matching button
    # (presumably the last one is a "next" control -- verify against the page).
    max_pages = first_page.find_all('div', class_="ssrcss-3vkeha-StyledButtonContent e1kcrsdk1")[-2]
    max_page_number = int(max_pages.text.strip())

    all_links = []

    for page_number in range(1, max_page_number + 1):
        page_soup = _fetch_soup(f"{region_url}?page={page_number}")
        for article in page_soup.find_all('div', attrs={'type': 'article'}):
            # Skip containers with no <a> or no href instead of raising.
            anchor = article.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                all_links.append(href)

    return max_page_number, all_links

regions = ['Europe', 'Australia', 'Asia', 'Africa', 'Latin America', 'Middle East']

# Running totals accumulated over every region.
total_links, total_pages = [], 0

for region in regions:
    max_page_number, links = all_article_links(region)
    if max_page_number is None:
        # No page count came back for this region; leave it out of the totals.
        continue
    print(f"Number of pages available in the '{region}' section:", max_page_number)
    print(f"Number of article links from all pages in '{region}' section:", len(links))
    total_pages += max_page_number
    total_links += links

print("Total number of article links across all regions:", len(total_links))
print("Total number of pages across all regions:", total_pages)

Number of pages available in the 'Europe' section: 42
Number of article links from all pages in 'Europe' section: 905
Number of pages available in the 'Australia' section: 42
Number of article links from all pages in 'Australia' section: 827
Number of pages available in the 'Asia' section: 42
Number of article links from all pages in 'Asia' section: 907
Number of pages available in the 'Africa' section: 25
Number of article links from all pages in 'Africa' section: 492
Number of pages available in the 'Latin America' section: 42
Number of article links from all pages in 'Latin America' section: 845
Number of pages available in the 'Middle East' section: 41
Number of article links from all pages in 'Middle East' section: 818
Total number of article links across all regions: 4794
Total number of pages across all regions: 234


### 2.6. Save Results

In [42]:
import csv

csv_file = 'article_links.csv'

# newline='' leaves line-ending handling to the csv writer.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Single-column file: a header row, then one collected link per row.
    writer.writerow(['Article Links'])
    writer.writerows((link,) for link in total_links)

print("Article links saved to:", csv_file)

Article links saved to: article_links.csv
