In [None]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import time
from datetime import datetime
from IPython.display import display

# Three-letter month abbreviations (Indonesian, plus the English forms that
# differ) mapped to month numbers. Looking the month up directly avoids
# locale-dependent parsing with strptime("%b").
_DETIK_MONTHS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4, "mei": 5, "may": 5,
    "jun": 6, "jul": 7, "agu": 8, "aug": 8, "sep": 9, "okt": 10, "oct": 10,
    "nov": 11, "des": 12, "dec": 12,
}
_DETIK_DATE_RE = re.compile(r'(\d{1,2}) (\w{3}) (\d{4})')

# Seconds to wait for a response before giving up; without a timeout a single
# stalled connection would block the caller indefinitely.
_REQUEST_TIMEOUT = 30


def _normalize_detik_date(raw_date):
    """Convert text containing e.g. "15 Agu 2024" to "15/08/2024".

    Returns the input unchanged when no recognisable date is found.
    """
    if not raw_date:
        return raw_date
    match = _DETIK_DATE_RE.search(raw_date)
    if match is None:
        return raw_date
    day, month, year = match.groups()
    month_number = _DETIK_MONTHS.get(month.lower())
    if month_number is None:
        return raw_date
    try:
        # Constructing the datetime validates the day/month/year combination.
        datetime(int(year), month_number, int(day))
    except ValueError:
        return raw_date
    return f"{int(day):02d}/{month_number:02d}/{year}"


def scrape_detik_articles(url):
    """
    Scrape one Detik listing page and return the disaster-related articles.

    Every <article> element on the listing page is followed to its detail
    page, from which the publication date and paragraph text are extracted.
    An article is kept only when its title or content matches one of the
    disaster keywords.

    Parameters
    ----------
    url : str
        URL of the listing page to scrape.

    Returns
    -------
    list of dict
        One dict per kept article with the keys "URL", "Date", "Title" and
        "Content". "Date" is a "dd/mm/YYYY" string when the date could be
        parsed, the raw date text when it could not, and None when the detail
        page had no date element or could not be fetched. An empty list is
        returned when the listing page cannot be fetched or has no articles.

    Notes
    -----
    Request errors are reported with print() and never raised. Articles
    without a link, photo galleries and "Video News" articles are skipped.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0"
    }

    # Whole-word patterns for the disaster keywords.
    keywords = [
        r"\berupsi\b", r"\bgunung meletus\b", r"\btanah longsor\b", r"\blongsor\b",
        r"\bbanjir\b", r"\bbanjir bandang\b", r"\btsunami\b",
        r"\bgempa\b", r"\bgempa bumi\b", r"\bbadai\b", r"\bputing beliung\b", r"\bangin kencang\b",
        r"\bkekeringan\b", r"\bkemarau panjang\b", r"\bhujan es\b", r"\bgelombang panas\b",
        r"\bcuaca ekstrem\b", r"\bgelombang ekstrem\b", r"\bgemuruh laut\b",
        r"\bkebakaran hutan\b", r"\bkebakaran lahan\b", r"\bkarhutla\b", r"\bapi\b"
    ]
    keyword_pattern = re.compile("|".join(keywords), re.IGNORECASE)

    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article")
    if not articles:
        print("No articles found.")
        return []

    news_list = []
    for article in articles:
        # An <article> without any <a href> cannot be followed; skip it
        # instead of failing on the substring checks below.
        link_tag = article.find("a", href=True)
        if link_tag is None:
            print("Skipping article without a link.")
            continue
        link = link_tag['href']

        # Ignore photo-gallery URLs.
        if "foto-news" in link or "/foto/" in link:
            print(f"Skipping foto-related URL: {link}")
            continue

        title_tag = article.find("h2", class_="title")
        title = title_tag.text.strip() if title_tag else "No title"

        print(f"Processing Article: {title}")

        # Reset per article so a failed fetch can never reuse the previous
        # article's date (or hit an unbound name on the first article).
        date = None
        try:
            article_response = requests.get(link, headers=headers, timeout=_REQUEST_TIMEOUT)
            article_response.raise_for_status()
            article_soup = BeautifulSoup(article_response.text, "html.parser")

            # Skip articles tagged "Video News".
            video_news_tag = article_soup.find("h2", class_="detail__subtitle", string="Video News")
            if video_news_tag:
                print(f"Skipping Video News article: {link}")
                continue

            # Remove the "parallaxindetail scrollpage" blocks.
            for parallax_element in article_soup.find_all("div", class_="parallaxindetail scrollpage"):
                parallax_element.decompose()

            # Remove the "ADVERTISEMENT" and "SCROLL TO CONTINUE WITH CONTENT" captions.
            for ad_element in article_soup.find_all("span", class_="para_caption", string="ADVERTISEMENT"):
                ad_element.decompose()
            for scroll_element in article_soup.find_all("p", class_="para_caption", string="SCROLL TO CONTINUE WITH CONTENT"):
                scroll_element.decompose()

            # The date element may be absent on some page layouts.
            date_tag = article_soup.find("div", class_="detail__date")
            if date_tag is not None:
                date = _normalize_detik_date(date_tag.get_text(strip=True))

            # Join all non-empty paragraphs and collapse runs of whitespace.
            paragraph_texts = (p.get_text(strip=True) for p in article_soup.find_all("p"))
            content = " ".join(text for text in paragraph_texts if text)
            content = re.sub(r'\s+', ' ', content).strip()
        except requests.exceptions.RequestException:
            print(f"Error fetching content for article: {link}")
            content = "Failed to fetch content."

        # Keep the article only if a keyword appears in the title or content.
        if keyword_pattern.search(title) or keyword_pattern.search(content):
            news_list.append({
                "URL": link,
                "Date": date,
                "Title": title,
                "Content": content
            })

    return news_list

def live_scrape_detik():
    """
    Poll page 1 of Detik's "bencana-alam" tag in an endless loop.

    Each round scrapes the listing, merges the results into the rows
    accumulated so far (one row per URL), rewrites the Excel workbook and
    displays the accumulated table. The loop sleeps 120 seconds between
    rounds and never returns on its own.
    """
    output_path = "detik_bencana_live4.xlsx"
    listing_url = "https://www.detik.com/tag/bencana-alam/?sortby=time&page=1"

    # Rows gathered across all rounds of this run.
    collected = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])

    while True:
        print("\nStarting live scraping process...")
        scraped = scrape_detik_articles(listing_url)

        if not scraped:
            print("No new articles found.")
        else:
            merged = pd.concat([collected, pd.DataFrame(scraped)], ignore_index=True)
            collected = merged.drop_duplicates(subset=["URL"])

            # Persist the accumulated rows to Excel.
            collected.to_excel(output_path, index=False)
            print(f"Data saved to {output_path}")

            # Show the up-to-date table.
            print("Latest scraped articles:")
            display(collected)

        print("Waiting for 2 minutes before the next scrape...\n")
        time.sleep(120)  # Wait 2 minutes

# Run the live scrape (loops indefinitely; interrupt the kernel to stop it)
live_scrape_detik()


Starting live scraping process...
Processing Article: 14 Rumah Rusak Diterjang Angin Puting Beliung di Bulukumba
Processing Article: Disebut Ikan Kiamat, Benarkah Kemunculan Oarfish Jadi Tanda Buruk?
Processing Article: Angin Kencang Terjang Sidoarjo, 20 Rumah Rusak dan 3 Warga Terluka
Processing Article: Asrama Polsek-2 Rumah Warga di Banggai Rusak Diterjang Angin Puting Beliung
Processing Article: Hujan Disertai Angin Kencang Landa Tasik, Rumah Rusak Tertimpa Pohon
Processing Article: Penghuni Laut Dalam Ikan Anglerfish Muncul ke Permukaan, Apakah Akan Ada Bencana?
Skipping foto-related URL: https://news.detik.com/foto-news/d-7766191/atap-rumah-rusak-pohon-tumbang-diterjang-angin-kencang-di-ciamis
Skipping foto-related URL: https://news.detik.com/foto-news/d-7766191/atap-rumah-rusak-pohon-tumbang-diterjang-angin-kencang-di-ciamis
Skipping foto-related URL: https://www.detik.com/jatim/foto/d-7716070/plengsengan-sungai-welang-ambrol-akses-dua-dusun-ditutup
Processing Article: Deretan 

Unnamed: 0,URL,Date,Title,Content
0,https://www.detik.com/sulsel/berita/d-7786842/...,20/02/2025,14 Rumah Rusak Diterjang Angin Puting Beliung ...,Sebanyak 14 unit rumah mengalami kerusakan aki...
1,https://www.detik.com/edu/detikpedia/d-7786373...,20/02/2025,"Disebut Ikan Kiamat, Benarkah Kemunculan Oarfi...",Kemunculan ikan oarfish atau disebut ikan kiam...
2,https://www.detik.com/jatim/berita/d-7782941/a...,17/02/2025,"Angin Kencang Terjang Sidoarjo, 20 Rumah Rusak...",Hujan deras disertai angin kencang melanda Kab...
3,https://www.detik.com/sulsel/berita/d-7780631/...,16/02/2025,Asrama Polsek-2 Rumah Warga di Banggai Rusak D...,"Asrama Polsek Balantak di Kabupaten Banggai, S..."
4,https://www.detik.com/jabar/berita/d-7780166/h...,15/02/2025,"Hujan Disertai Angin Kencang Landa Tasik, Ruma...",Hujan deras disertai angin kencang melanda Kec...
5,https://www.detik.com/bali/nusra/d-7774720/fen...,12/02/2025,"Fenomena Tanah Bergerak di Sumbawa, 15 Rumah T...",Sebanyak 15 rumah warga terdampak fenomena tan...
6,https://www.detik.com/sulsel/berita/d-7772781/...,11/02/2025,Angin Puting Beliung Terjang Permukiman di Bul...,Sebanyak 15 unit rumah dan 1 masjid di Kabupat...
7,https://www.detik.com/bali/berita/d-7773092/lo...,11/02/2025,"Longsor di Perbatasan Gianyar-Klungkung, Lalu ...",Bencana tanah longsor terjadi di perbatasan Gi...


Waiting for 2 minutes before the next scrape...



In [4]:
import pandas as pd

# Reload the workbook written by live_scrape_detik() and preview the first rows.
df = pd.read_excel("detik_bencana_live4.xlsx")
df.head()

Unnamed: 0,URL,Date,Title,Content
0,https://www.detik.com/sulsel/berita/d-7786842/...,20/02/2025,14 Rumah Rusak Diterjang Angin Puting Beliung ...,Sebanyak 14 unit rumah mengalami kerusakan aki...
1,https://www.detik.com/edu/detikpedia/d-7786373...,20/02/2025,"Disebut Ikan Kiamat, Benarkah Kemunculan Oarfi...",Kemunculan ikan oarfish atau disebut ikan kiam...
2,https://www.detik.com/jatim/berita/d-7782941/a...,17/02/2025,"Angin Kencang Terjang Sidoarjo, 20 Rumah Rusak...",Hujan deras disertai angin kencang melanda Kab...
3,https://www.detik.com/sulsel/berita/d-7780631/...,16/02/2025,Asrama Polsek-2 Rumah Warga di Banggai Rusak D...,"Asrama Polsek Balantak di Kabupaten Banggai, S..."
4,https://www.detik.com/jabar/berita/d-7780166/h...,15/02/2025,"Hujan Disertai Angin Kencang Landa Tasik, Ruma...",Hujan deras disertai angin kencang melanda Kec...


In [7]:
# Inspect the dtype pandas assigned to the Date column (the scraper stores dates as "dd/mm/YYYY" text).
df['Date'].dtype

dtype('O')