In [None]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import os
from IPython.display import display

def live_scrape_kompas(file_name="kompas_bencana4.xlsx", interval=300):
    """
    Fungsi utama untuk melakukan scraping artikel bencana dari halaman Kompas dan menyimpan hasilnya ke file Excel.
    Fungsi ini berjalan secara live dengan interval waktu yang ditentukan.
    
    Args:
    - file_name (str): Nama file Excel untuk menyimpan hasil scraping.
    - interval (int): Waktu tunggu antar scraping dalam detik (default: 300 detik atau 5 menit).
    """
    # Base URL
    url = "https://www.kompas.com/tag/bencana?page=1"

    # Header untuk request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    # Pola kategori
    category_patterns = {
        "Banjir": r"\bbanjir\b|\bbanjir bandang\b",
        "Gempa Bumi": r"\bgempa\b|\bgempa bumi\b",
        "Tanah Longsor": r"\btanah longsor\b|\blongsor\b",
        "Gunung Meletus": r"\bgunung meletus\b|\berupsi\b",
        "Tsunami": r"\btsunami\b",
        "Puting Beliung": r"\bputing beliung\b|\bbadai\b",
        "Kekeringan": r"\bkekeringan\b|\bkemarau panjang\b",
        "Cuaca Ekstrem": r"\bcuaca ekstrem\b|\bhujan es\b|\bgelombang panas\b",
        "Gelombang Ekstrem": r"\bgelombang ekstrem\b|\bgemuruh laut\b",
        "Kebakaran Hutan": r"\bkebakaran hutan\b|\bapi di hutan\b|\bkarhutla\b"
    }

    # Gabungkan semua pola menjadi satu regex
    keyword_pattern = re.compile("|".join(category_patterns.values()), re.IGNORECASE)

    def scrape_page():
        """Scrapes the first page of Kompas tag 'bencana' for articles."""
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Gagal mengakses halaman: {e}")
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        articles = soup.find_all('div', class_='article__list__title')
        scraped_data = []

        for article in articles:
            try:
                # Judul dan URL
                title = article.find('h3', class_='article__title').get_text(strip=True)
                link = article.find('a', class_='article__link')['href']
                date = article.find_next_sibling('div', class_='article__list__info').find('div', class_='article__date').get_text(strip=True)
                date = date.split(',')[0]  # Ambil bagian sebelum koma

                # Request konten artikel
                article_response = requests.get(link, headers=headers)
                if article_response.status_code != 200:
                    print(f"Gagal mengakses artikel: {link}")
                    continue

                article_soup = BeautifulSoup(article_response.content, 'html.parser')

                # Ambil semua paragraf
                paragraphs = []
                for p in article_soup.find_all('p'):
                    if p.find_parent('div', class_='footerCopyright'):
                        continue
                    if not p.has_attr('class'):
                        text = ''.join([elem.strip() if isinstance(elem, str) else ' ' + elem.get_text(strip=True) + ' ' for elem in p.contents])
                        paragraphs.append(text)
                content = ' '.join(paragraphs)
                content = re.sub(r'^(Tim Redaksi -|Editor -)\s*', '', content)

                # Filter hanya artikel dengan kategori
                if keyword_pattern.search(title) or keyword_pattern.search(content):
                    category = "Lainnya"
                    for cat, pattern in category_patterns.items():
                        if re.search(pattern, content, re.IGNORECASE) or re.search(pattern, title, re.IGNORECASE):
                            category = cat
                            break

                    scraped_data.append({'URL': link, 'Date': date, 'Title': title, 'Content': content})
            except Exception as e:
                print(f"Error pada artikel: {e}")
        return scraped_data

    # Loop scraping secara berkala
    while True:
        print("Starting scraping process...")
        scraped_articles = scrape_page()

        if scraped_articles:
            if os.path.exists(file_name):
                df_existing = pd.read_excel(file_name)
                existing_urls = set(df_existing['URL'])
            else:
                df_existing = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])
                existing_urls = set()

            new_articles = [article for article in scraped_articles if article['URL'] not in existing_urls]
            if new_articles:
                print(f"Found {len(new_articles)} new articles. Saving to {file_name}...")
                df_new = pd.DataFrame(new_articles)
                df_combined = pd.concat([df_existing, df_new], ignore_index=True)
                df_combined.to_excel(file_name, index=False)
                print(f"File updated with new articles.")
            else:
                print("No new articles found.")
        else:
            print("No articles found during scraping.")

        # Menampilkan data terbaru di tabel
        print("Displaying the latest scraped articles:")
        df_sorted = pd.read_excel(file_name).sort_values(by="Date", ascending=False)
        display(df_sorted.head())

        print(f"Waiting for {interval / 60} minutes before the next run...")
        time.sleep(interval)

# Jalankan fungsi
live_scrape_kompas()

Starting scraping process...
Found 1 new articles. Saving to kompas_bencana4.xlsx...
File updated with new articles.
Displaying the latest scraped articles:


Unnamed: 0,URL,Date,Title,Content
7,http://surabaya.kompas.com/read/2025/02/20/110...,20/02/2025,"Gunung Semeru Erupsi 5 Kali, Letusan Asap Capa...","Tim Redaksi LUMAJANG, KOMPAS.com - Gunung Sem..."
0,http://denpasar.kompas.com/read/2025/02/19/160...,19/02/2025,Ratusan Keluarga di Buleleng Kesulitan Air Ber...,"Tim Redaksi BULELENG, KOMPAS.com - Ratusan ke..."
1,http://regional.kompas.com/read/2025/02/18/201...,18/02/2025,"Tunggu 2 Bulan, Warga 4 Desa Terdampak Erupsi ...","Tim Redaksi FLORES TIMUR, KOMPAS.com – Penjab..."
2,http://regional.kompas.com/read/2025/02/14/170...,14/02/2025,Korupsi Bansos Korban Banjir Bandang di Lembat...,"Tim Redaksi LEMBATA, KOMPAS.com – Kejaksaan N..."
3,http://money.kompas.com/read/2025/02/14/073000...,14/02/2025,PetroChina Bantu Penanggulangan Bencana di Sek...,"Editor JAKARTA, KOMPAS.com – PetroChina Inter..."


Waiting for 5.0 minutes before the next run...


In [2]:
import pandas as pd

df = pd.read_excel("kompas_bencana4.xlsx")
df.head()

Unnamed: 0,URL,Date,Title,Content
0,http://denpasar.kompas.com/read/2025/02/19/160...,19/02/2025,Ratusan Keluarga di Buleleng Kesulitan Air Ber...,"Tim Redaksi BULELENG, KOMPAS.com - Ratusan ke..."
1,http://regional.kompas.com/read/2025/02/18/201...,18/02/2025,"Tunggu 2 Bulan, Warga 4 Desa Terdampak Erupsi ...","Tim Redaksi FLORES TIMUR, KOMPAS.com – Penjab..."
2,http://regional.kompas.com/read/2025/02/14/170...,14/02/2025,Korupsi Bansos Korban Banjir Bandang di Lembat...,"Tim Redaksi LEMBATA, KOMPAS.com – Kejaksaan N..."
3,http://money.kompas.com/read/2025/02/14/073000...,14/02/2025,PetroChina Bantu Penanggulangan Bencana di Sek...,"Editor JAKARTA, KOMPAS.com – PetroChina Inter..."
4,http://regional.kompas.com/read/2025/02/13/151...,13/02/2025,"Buaya 4 Meter Muncul Saat Banjir di Makassar, ...","Tim Redaksi MAKASSAR, KOMPAS.com - Seekor bua..."


In [6]:
df['Date'].dtype

dtype('O')