# Scraping Presidential and Vice Presidential Articles on Rappler.com

## Importing Libraries

We import the necessary libraries for web scraping, handling HTTP requests, parsing HTML, and managing data storage and analysis.

In [None]:
import requests
from bs4 import BeautifulSoup
import random
import re
import time
import csv
from collections import defaultdict

## Defining the Scraping Function

We define the scrape_article function with a retry mechanism for handling HTTP request failures.

In [None]:
def scrape_article(url):
    """Fetch an article page and extract its title, body text and metadata.

    The page is requested up to three times. Between attempts the function
    sleeps with exponential backoff plus random jitter; when the server
    answers HTTP 429 and supplies a ``Retry-After`` header holding a whole
    number of seconds, that value is used as the delay instead.

    Args:
        url: Address of the article page to download.

    Returns:
        A dict with the keys ``url``, ``title``, ``body``, ``has_marcos``,
        ``has_sara`` and ``pub_year``, or ``None`` when every attempt failed.
        ``has_marcos`` / ``has_sara`` are case-sensitive substring checks on
        the body text; ``pub_year`` is the first four characters of the
        ``<time>`` element's ``datetime`` attribute, or ``'Unknown'``.
    """
    max_retries = 3
    base_delay = 2

    ## Handling HTTP Requests with Retries
    #
    # This loop attempts to fetch the webpage, with exponential backoff for
    # retrying if a request fails.
    response = None
    for attempt in range(max_retries):
        try:
            # A timeout keeps one stalled connection from hanging the whole
            # run; it raises a RequestException subclass, so it is retried
            # like any other failure.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            break
        except requests.exceptions.RequestException as e:
            # Give up after the last attempt regardless of the failure type,
            # so an error page is never parsed as if it were an article.
            if attempt == max_retries - 1:
                print(f"Maximum retries exceeded. Error: {e}")
                return None

            # Take the status from the exception itself: connection errors
            # and timeouts carry no response, and the local `response` may be
            # unset or left over from a previous attempt.
            failed_response = getattr(e, "response", None)
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            if failed_response is not None and failed_response.status_code == 429:
                retry_after = failed_response.headers.get("Retry-After")
                # Retry-After may also be an HTTP date; only honour it when
                # it is a plain number of seconds.
                if retry_after and retry_after.strip().isdigit():
                    delay = int(retry_after)
                print(f"Attempt {attempt + 1} failed with 429. Retrying after {delay} seconds...")
            else:
                print(f"Attempt {attempt + 1} failed. Retrying...")
            time.sleep(delay)

    ## Parsing the HTML Content
    #
    # We parse the HTML content to extract the article's title, body, keyword
    # presence, and publication year.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look each element up once and reuse the result.
    title_tag = soup.find('h1', class_='post-single__title')
    title = title_tag.get_text(strip=True) if title_tag is not None else 'No title found'

    body_tag = soup.find('div', class_='post-single__body')
    body = body_tag.get_text(strip=True) if body_tag is not None else ''

    has_marcos = 'Marcos' in body
    has_sara = 'Sara' in body

    time_tag = soup.find('time', class_='entry-date published post__timeago')
    if time_tag is not None and time_tag.has_attr('datetime'):
        # Only the leading four characters (the year) are kept.
        pub_year = time_tag['datetime'][:4]
    else:
        pub_year = 'Unknown'

    return {
        'url': url,
        'title': title,
        'body': body,
        'has_marcos': has_marcos,
        'has_sara': has_sara,
        'pub_year': pub_year
    }


## List of Article URLs

We specify the URLs of the articles we want to scrape.

# Article pages to scrape. Every entry ends with a comma so that appending
# another URL cannot silently merge two adjacent string literals into one.
urls = [
    'https://www.rappler.com/philippines/marcos-updates-list-priority-measures-ledac-divorce-sogie-bills-excluded-june-2024/',
    'https://www.rappler.com/philippines/mindanao/sara-duterte-downplays-opposition-role-thinks-still-friends-with-marcos/',
    'https://www.rappler.com/voices/opinion-genuine-ilokano-reflections-marcos-loyalism/',
    'https://www.rappler.com/newsbreak/iq/stories-tracking-marcos-disinformation',
    # Too many URLs to list here
]


## Collecting Article Data

We scrape each URL and store the data in a list of dictionaries.

# Scrape the URLs in order; a page that could not be fetched leaves None in
# its position.
articles = list(map(scrape_article, urls))

## Analyzing Keyword Presence per Year

For each publication year, we count how many articles mention "Marcos" and how many mention "Sara" (the two keywords are tallied separately), then print the results.

In [None]:
# Per-year tally of articles mentioning each keyword. A year only gets an
# entry once some article from that year contains at least one keyword.
keyword_count_per_year = defaultdict(lambda: {'Marcos': 0, 'Sara': 0})

for article in articles:
    # Skip articles that could not be scraped (None entries).
    if not article:
        continue
    year = article['pub_year']
    for keyword, flag_key in (('Marcos', 'has_marcos'), ('Sara', 'has_sara')):
        if article[flag_key]:
            keyword_count_per_year[year][keyword] += 1

# Report the tallies in the order the years were first encountered.
for year, counts in keyword_count_per_year.items():
    print(f"Year: {year}, Marcos: {counts['Marcos']}, Sara: {counts['Sara']}")


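# Output:

Note: the figures below include overall totals and a per-year "Both" count, which the analysis code shown above does not print; they presumably come from a fuller version of the script.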

Articles with Marcos only: 72
Articles with Sara only: 68
Articles with both Marcos and Sara: 19

Keyword counts per year:
2024: Marcos: 30, Sara: 26, Both: 13
2022: Marcos: 17, Sara: 7, Both: 2
2021: Marcos: 3, Sara: 5, Both: 2
2020: Marcos: 1, Sara: 0, Both: 0
2023: Marcos: 13, Sara: 27, Both: 2
2017: Marcos: 1, Sara: 0, Both: 0
2016: Marcos: 2, Sara: 0, Both: 0
2015: Marcos: 3, Sara: 0, Both: 0
2019: Marcos: 2, Sara: 3, Both: 0