In [None]:
%%writefile BBC_colab.py
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from datetime import datetime, timedelta
import logging
import time
import random
import math
import re

# --- Configuration ---
# Name of the CSV file the entry point hands to save_to_csv().
OUTPUT_FILENAME = 'scraped_BBC_africa.csv'
# User-Agent string applied to the Playwright browser context in scrape_all().
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
# Assumed number of results per BBC search page; scrape_all() only uses it to
# derive an upper bound on how many pages to request.
NEWS_PER_PAGE_BBC = 9

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level accumulator: scrape_all() extends it, the entry point saves it.
all_news_data = []

# --- Helper ---
def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed. Returns ``None`` when *text*
    is empty or ``None``.
    """
    if not text:
        return None
    # str.split() without a separator already treats '\n' and '\r' as
    # whitespace, so no separate newline replacement is needed.
    words = text.split()
    return ' '.join(words)

def get_current_datetime_objects():
    """Return the current local time as a naive ``datetime`` object."""
    now = datetime.now()
    return now

def format_datetime_for_storage(dt_obj):
    """Split *dt_obj* into its storage strings.

    Returns a ``(date, time)`` tuple formatted as ``YYYY-MM-DD`` and
    ``HH:MM:SS`` respectively.
    """
    date_part = dt_obj.strftime('%Y-%m-%d')
    time_part = dt_obj.strftime('%H:%M:%S')
    return date_part, time_part

def parse_absolute_date(date_str, date_format):
    """Parse an absolute date string and normalise it to ``YYYY-MM-DD``.

    Args:
        date_str: Text to parse; surrounding whitespace is ignored.
        date_format: ``strptime`` format the text is expected to follow.

    Returns:
        The date as a ``YYYY-MM-DD`` string, or ``None`` when *date_str* is
        empty, not a string, or does not match *date_format*.
    """
    if not date_str:
        return None
    try:
        return datetime.strptime(date_str.strip(), date_format).strftime('%Y-%m-%d')
    except (ValueError, TypeError, AttributeError):
        # Only swallow "this is not a date in that format"; a bare except
        # would also hide KeyboardInterrupt and SystemExit.
        return None

def parse_relative_date(relative_str, access_dt_obj):
    """Turn a relative timestamp such as ``"3 hrs ago"`` into ``YYYY-MM-DD``.

    Recognised forms (case-insensitive): ``"<n> <unit> ago"`` where the unit
    is a minute/hour/day/week spelling or abbreviation (the space before the
    unit is optional, so ``"3h ago"`` works), plus text containing
    ``"yesterday"``, ``"today"`` or ``"just now"``.

    Args:
        relative_str: Raw text to interpret; may be ``None`` or empty.
        access_dt_obj: The ``datetime`` the offset is subtracted from.

    Returns:
        The resolved date as a ``YYYY-MM-DD`` string, or ``None`` when the
        text is missing, not a string, not a recognised relative expression,
        or resolves to a date outside the supported range.
    """
    # Callers pass None when the page has no date element; treat that as
    # "not a relative date" instead of raising AttributeError.
    if not relative_str or not isinstance(relative_str, str):
        return None

    text = relative_str.lower().strip()
    match = re.match(r'(\d+)\s*(hour|hr|h|minute|min|m|day|d|week|wk|w)(?:s)?\s+ago', text)
    if match:
        amount = int(match.group(1))
        unit = match.group(2)
        try:
            if unit in ('hour', 'hr', 'h'):
                delta = timedelta(hours=amount)
            elif unit in ('minute', 'min', 'm'):
                delta = timedelta(minutes=amount)
            elif unit in ('day', 'd'):
                delta = timedelta(days=amount)
            else:  # 'week', 'wk', 'w' - the only alternatives left in the pattern
                delta = timedelta(weeks=amount)
            return (access_dt_obj - delta).strftime('%Y-%m-%d')
        except OverflowError:
            # Absurdly large offsets cannot be represented as a date.
            return None

    if "yesterday" in text:
        return (access_dt_obj - timedelta(days=1)).strftime('%Y-%m-%d')
    if "today" in text or "just now" in text:
        return access_dt_obj.strftime('%Y-%m-%d')
    return None

async def fetch_page_content_playwright(page, url, site_name):
    """Load *url* in the given Playwright *page* and return its HTML.

    After navigation the coroutine waits a random 4-7 seconds. For the
    ``"BBC"`` site it then scrolls down twice and back to the top before
    reading the DOM (presumably to trigger lazily loaded cards - confirm).

    Returns:
        The page HTML as a string, or ``None`` if any step raised; the error
        is logged rather than propagated.
    """
    logging.info(f"Fetching [{site_name}]: {url}")
    try:
        await page.goto(url, timeout=60000, wait_until='domcontentloaded')
        settle_ms = random.uniform(4000, 7000)
        await page.wait_for_timeout(settle_ms)

        if site_name == "BBC":
            scroll_down = "window.scrollBy(0, window.innerHeight * 0.7)"
            for script in (scroll_down, scroll_down):
                await page.evaluate(script)
                await page.wait_for_timeout(500)
            await page.evaluate("window.scrollTo(0, 0)")
            await page.wait_for_timeout(200)

        html = await page.content()
        return html
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")
        return None

def parse_bbc(html, access_dt_obj):
    """Extract article records from a BBC search-results page.

    Each ``div[data-testid="newport-card"]`` becomes one dict with the keys
    ``media``, ``access_date``, ``access_time``, ``url``, ``title``,
    ``description`` and ``date``. Cards without both a site-relative link and
    a headline are skipped.

    Args:
        html: HTML source of the results page.
        access_dt_obj: ``datetime`` of the page access; stored with each
            record and used as the reference for relative dates.

    Returns:
        A list of article dicts (possibly empty).
    """
    soup = BeautifulSoup(html, 'lxml')
    articles = soup.find_all('div', attrs={'data-testid': 'newport-card'})
    news_list = []
    access_date_str, access_time_str = format_datetime_for_storage(access_dt_obj)

    for article in articles:
        data = {'media': 'BBC', 'access_date': access_date_str, 'access_time': access_time_str}
        try:
            link_tag = article.find('a', attrs={'data-testid': 'internal-link'})
            href = link_tag.get('href') if link_tag else None
            # Only site-relative links are turned into absolute BBC URLs.
            data['url'] = f"https://www.bbc.com{href}" if href and href.startswith('/') else None

            title_tag = article.find('h2', attrs={'data-testid': 'card-headline'})
            data['title'] = clean_text(title_tag.get_text()) if title_tag else None

            # Prefer the generated description class; fall back to the first <p>.
            desc_tag = article.find('div', class_=lambda x: x and x.startswith('sc-cdecfb63-3')) or article.find('p')
            data['description'] = clean_text(desc_tag.get_text()) if desc_tag else None

            date_tag = article.find('span', attrs={'data-testid': 'card-metadata-lastupdated'})
            raw_date_text = date_tag.get_text(strip=True) if date_tag else None
            parsed_date = None
            # A card without a date must not reach the date helpers: calling
            # them with None used to raise and discard the whole article.
            if raw_date_text:
                parsed_date = (
                    parse_relative_date(raw_date_text, access_dt_obj)
                    or parse_absolute_date(raw_date_text, '%d %b %Y')
                )
            # Keep the raw text when it cannot be normalised.
            data['date'] = parsed_date if parsed_date else raw_date_text

            if data['url'] and data['title']:
                news_list.append(data)
        except Exception as e:
            # One malformed card must not abort the rest of the page.
            logging.error(f"Error parsing BBC article: {e}")
    return news_list

async def scrape_all(limits):
    """Scrape the configured sites and append the results to ``all_news_data``.

    Currently only the ``'BBC'`` key of *limits* is consulted: BBC search
    results for "africa" are fetched page by page until the target count is
    reached, a page yields no articles, a fetch fails, or the page cap
    (estimated pages + 5) is hit.

    Args:
        limits: Mapping of site name to the number of articles wanted; a
            missing or non-positive value skips that site.

    Side effects:
        Extends the module-level ``all_news_data`` list with at most
        ``limits['BBC']`` article dicts.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(user_agent=USER_AGENT)
            # Images, stylesheets and fonts are aborted; only the HTML is parsed.
            await context.route("**/*.{png,jpg,jpeg,webp,gif,css,woff,woff2}", lambda route: route.abort())
            page = await context.new_page()

            bbc_target = limits.get('BBC', 0)
            if bbc_target > 0:
                logging.info(f"--- Starting BBC Scrape (Target: {bbc_target}) ---")
                bbc_base_url = "https://www.bbc.com/search?q=africa"
                collected = []

                html_content = await fetch_page_content_playwright(page, bbc_base_url, "BBC")
                if html_content:
                    # Timestamp each page when it is fetched: a large crawl runs
                    # for hours, and relative dates ("3 hrs ago") must be
                    # resolved against the moment the page was actually read.
                    news_on_page = parse_bbc(html_content, get_current_datetime_objects())
                    collected.extend(news_on_page)
                    logging.info(f"BBC Page 1: Found {len(news_on_page)}. Total: {len(collected)}")

                page_idx = 2
                # Allow a few extra pages in case some return fewer results.
                max_pages = math.ceil(bbc_target / NEWS_PER_PAGE_BBC) + 5

                while len(collected) < bbc_target and page_idx <= max_pages:
                    next_url = f"{bbc_base_url}&page={page_idx}"
                    html_content = await fetch_page_content_playwright(page, next_url, "BBC")
                    if html_content:
                        news_on_page = parse_bbc(html_content, get_current_datetime_objects())
                        if not news_on_page:
                            logging.info(f"BBC Page {page_idx}: No more articles.")
                            break
                        collected.extend(news_on_page)
                        logging.info(f"BBC Page {page_idx}: Found {len(news_on_page)}. Total: {len(collected)}")
                    else:
                        logging.error(f"BBC Page {page_idx}: Failed to fetch.")
                        break
                    page_idx += 1

                all_news_data.extend(collected[:bbc_target])
                logging.info(f"--- Finished BBC. Collected {len(collected[:bbc_target])} articles ---")
        finally:
            # Close the browser even when setup or scraping raises.
            await browser.close()

def save_to_csv(data, filename):
    """Write the scraped records to *filename* as a UTF-8 (with BOM) CSV.

    The output always has the columns ``media, date, title, description,
    url, access_date, access_time`` in that order; columns missing from the
    records are written empty and any other keys are dropped. Nothing is
    written when *data* is empty. Write errors are logged, not raised.
    """
    if not data:
        logging.warning("No data was scraped. CSV file will not be created.")
        return

    cols = ['media', 'date', 'title', 'description', 'url', 'access_date', 'access_time']
    # reindex selects and orders the columns in one step, creating any
    # missing column as empty.
    df = pd.DataFrame(data).reindex(columns=cols)
    try:
        df.to_csv(filename, index=False, encoding='utf-8-sig')
    except Exception as e:
        logging.error(f"Error saving data to CSV: {e}")
    else:
        logging.info(f"Successfully saved {len(df)} articles to {filename}")

# --- Execution ---
if __name__ == "__main__":
    # Number of articles wanted per site.
    user_limits = {'BBC': 20000}
    total_requested = sum(user_limits.values())
    if total_requested <= 0:
        logging.info("All target limits are 0. Exiting.")
    else:
        asyncio.run(scrape_all(user_limits))
        save_to_csv(all_news_data, OUTPUT_FILENAME)


Overwriting BBC_colab.py


In [None]:
# The scraper only launches Chromium, so install just that browser.
# --with-deps also installs the system libraries the browser needs, which
# addresses the "Host system is missing dependencies" warning.
!pip install playwright pandas beautifulsoup4 lxml
!playwright install --with-deps chromium
!python BBC_colab.py


╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:269:9)
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:105:5)[39m
    at async Registry._

In [None]:
import os

from google.colab import files

# The scraper does not create the CSV when it collected no data, so check
# for the file first instead of letting the download call fail.
csv_path = 'scraped_BBC_africa.csv'
if os.path.exists(csv_path):
    files.download(csv_path)
else:
    print(f"{csv_path} was not found; the scraper produced no output to download.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>