In [None]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from IPython.display import display
import time

# Seconds to wait for the site to respond before a request is abandoned.
# Without a timeout a single stalled connection would block the scraper forever.
_CNN_REQUEST_TIMEOUT = 15

# Three-letter month abbreviations mapped to month numbers. Indonesian forms
# come first; the English forms that differ are kept so that datelines the
# previous strptime-based code accepted are still recognised.
_CNN_MONTH_NUMBERS = {
    "jan": 1, "feb": 2, "mar": 3, "apr": 4,
    "mei": 5, "may": 5,
    "jun": 6, "jul": 7,
    "agu": 8, "aug": 8,
    "sep": 9,
    "okt": 10, "oct": 10,
    "nov": 11,
    "des": 12, "dec": 12,
}

# Matches datelines such as "15 Agu 2024" anywhere inside a longer string.
_CNN_DATE_PATTERN = re.compile(r"(\d{1,2}) (\w{3}) (\d{4})")


def _normalize_cnn_date(raw_date):
    """Return *raw_date* as "dd/mm/YYYY", or unchanged if it cannot be parsed."""
    match = _CNN_DATE_PATTERN.search(raw_date)
    if not match:
        return raw_date

    day, month_abbr, year = match.groups()
    # Numeric lookup instead of strptime("%b"): "%b" depends on the process
    # locale, and an unknown abbreviation would raise instead of degrading.
    month = _CNN_MONTH_NUMBERS.get(month_abbr.lower())
    if month is None:
        return raw_date

    try:
        parsed = datetime(int(year), month, int(day))
    except ValueError:
        # Impossible calendar date (e.g. "31 Feb 2025"): keep the raw text.
        return raw_date
    return parsed.strftime("%d/%m/%Y")


def _extract_article_text(content_element):
    """Return the whitespace-normalised paragraph text of an article body.

    Note: ``div.paradetail`` children are removed from *content_element* in
    place (the original author marked them as advertisements).
    """
    for ad_div in content_element.find_all("div", class_="paradetail"):
        ad_div.decompose()

    pieces = []
    for paragraph in content_element.find_all("p"):
        # Only bare text nodes and direct <span>/<a> children contribute text;
        # other child tags are skipped, as in the original implementation.
        for node in paragraph.children:
            if node.name in ("span", "a"):
                pieces.append(node.get_text(strip=True))
            elif node.name is None:
                pieces.append(node.strip())

    return re.sub(r"\s+", " ", " ".join(pieces)).strip()


def scrape_cnn_today(keywords):
    """
    Scrape the CNN Indonesia index page and return the articles matching *keywords*.

    The index page is fetched once, then every linked article is downloaded
    and its date and body text are extracted. Today's date is only printed
    for information; articles are NOT filtered by date.

    Args:
        keywords: Iterable of regular-expression fragments. They are OR-ed
            together and matched case-insensitively against the title and the
            body. An empty iterable yields an empty pattern, which matches
            every article.

    Returns:
        A list of dicts with the keys "URL", "Date", "Title" and "Content".
        "Date" is "dd/mm/YYYY" when a dateline is recognised, otherwise the
        raw dateline text or "No date found". The list is empty when the
        index page cannot be fetched.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0"
    }
    keyword_pattern = re.compile("|".join(keywords), re.IGNORECASE)
    today_date = datetime.now().strftime("%Y/%m/%d")
    base_url = "https://www.cnnindonesia.com/peristiwa/indeks/18?"

    news_list = []

    print(f"Scraping articles for date: {today_date}")

    # One session for the whole run so connections to the site are reused.
    with requests.Session() as session:
        session.headers.update(headers)

        try:
            response = session.get(base_url, timeout=_CNN_REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page: {e}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")

        for article in soup.find_all("article", class_="flex-grow"):
            link_tag = article.find("a", href=True)
            link = link_tag["href"] if link_tag else None
            title_tag = article.find("h2", class_="text-cnn_black_light dark:text-white mb-2 inline leading-normal text-xl group-hover:text-cnn_red")
            title = title_tag.text.strip() if title_tag else "No title"

            # Skip entries without an absolute link.
            if not link or not link.startswith("http"):
                continue

            print(f"Processing Article: {title}")

            try:
                article_response = session.get(link, timeout=_CNN_REQUEST_TIMEOUT)
                article_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                # A single failing article must not abort the whole run.
                print(f"Error fetching article {link}: {e}")
                continue

            article_soup = BeautifulSoup(article_response.text, "html.parser")

            # Dateline, converted e.g. from "15 Agu 2024" to "15/08/2024".
            date_element = article_soup.find("div", class_="text-cnn_grey text-sm mb-4")
            if date_element:
                date = _normalize_cnn_date(date_element.text.strip())
            else:
                date = "No date found"

            # Article body.
            content_element = article_soup.find("div", class_="detail-text text-cnn_black text-sm grow min-w-0")
            if content_element:
                content = _extract_article_text(content_element)
            else:
                content = "No content found"

            # Keep the article only if a keyword occurs in the title or body.
            if keyword_pattern.search(title) or keyword_pattern.search(content):
                news_list.append({
                    "URL": link,
                    "Date": date,
                    "Title": title,
                    "Content": content,
                })
            else:
                print(f"Skipping article (no matching keywords): {title}")

    return news_list

def live_scrape_cnn_today(interval_seconds=120, file_name="cnn_live_scrape_today.xlsx", max_cycles=None):
    """
    Repeatedly scrape CNN Indonesia, display the matches and save them to Excel.

    Each cycle calls ``scrape_cnn_today`` with a fixed list of
    natural-disaster keywords, merges the result into the articles collected
    so far (de-duplicated by URL), displays the table and rewrites the Excel
    file. The loop then sleeps before the next cycle.

    Args:
        interval_seconds: Pause between two cycles, in seconds (default 120,
            i.e. 2 minutes).
        file_name: Path of the Excel workbook that is overwritten each time
            matching articles are found.
        max_cycles: Number of cycles to run, or None (default) to run until
            interrupted.

    Returns:
        The DataFrame of all articles collected, with the columns "URL",
        "Date", "Title" and "Content". It is returned when ``max_cycles`` is
        reached or when the loop is stopped with Ctrl+C / a kernel interrupt.
    """
    keywords = [
        r"\berupsi\b", r"\bgunung meletus\b", r"\btanah longsor\b", r"\blongsor\b",
        r"\bbanjir\b", r"\bbanjir bandang\b", r"\btsunami\b",
        r"\bgempa\b", r"\bgempa bumi\b", r"\bbadai\b", r"\bputing beliung\b", r"\bangin kencang\b",
        r"\bkekeringan\b", r"\bkemarau panjang\b", r"\bhujan es\b", r"\bgelombang panas\b",
        r"\bcuaca ekstrem\b", r"\bgelombang ekstrem\b", r"\bgemuruh laut\b",
        r"\bkebakaran hutan\b", r"\bkebakaran lahan\b", r"\bkarhutla\b", r"\bapi\b"
    ]

    all_articles = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])
    completed_cycles = 0

    try:
        while max_cycles is None or completed_cycles < max_cycles:
            print("\nStarting live scrape for today's articles...")
            articles = scrape_cnn_today(keywords)

            if articles:
                new_df = pd.DataFrame(articles)

                # Merge with earlier results. Concatenating onto the initial
                # empty frame is avoided because pandas deprecates concat
                # with empty entries and emits a FutureWarning for it.
                if all_articles.empty:
                    combined = new_df
                else:
                    combined = pd.concat([all_articles, new_df], ignore_index=True)
                # ignore_index keeps the row labels contiguous after duplicates are dropped.
                all_articles = combined.drop_duplicates(subset=["URL"], ignore_index=True)

                # Show the latest data.
                display(all_articles)

                # Save to Excel; a locked or unwritable file must not stop the loop.
                try:
                    all_articles.to_excel(file_name, index=False)
                except OSError as e:
                    print(f"Could not save to {file_name}: {e}")
                else:
                    print(f"Saved to {file_name}.")
            else:
                print("No new articles found.")

            completed_cycles += 1
            if max_cycles is not None and completed_cycles >= max_cycles:
                # No point in sleeping after the final cycle.
                break

            # Wait before the next scrape.
            print(f"Waiting for {interval_seconds} seconds before the next scrape...\n")
            time.sleep(interval_seconds)
    except KeyboardInterrupt:
        # Interrupting is the only way out of an unbounded run; exit quietly
        # and hand back what has been collected instead of a traceback.
        print("\nLive scrape stopped by user.")

    return all_articles

# Run the live scrape only when this file is executed directly (as a script
# or a notebook cell), so importing it does not start an endless network loop.
if __name__ == "__main__":
    live_scrape_cnn_today()


Starting live scrape for today's articles...
Scraping articles for date: 2025/02/23
Processing Article: Polisi Jamin Keamanan Band Sukatani Manggung di Tegal Hari Ini
Skipping article (no matching keywords): Polisi Jamin Keamanan Band Sukatani Manggung di Tegal Hari Ini
Processing Article: Belum Ikut Retret, Kepala Daerah PDIP Standby di Magelang
Skipping article (no matching keywords): Belum Ikut Retret, Kepala Daerah PDIP Standby di Magelang
Processing Article: Bupati Purbalingga Tawarkan Vokalis Sukatani Jadi Guru Lagi
Skipping article (no matching keywords): Bupati Purbalingga Tawarkan Vokalis Sukatani Jadi Guru Lagi
Processing Article: Korban Bom Bali Takut Bantuan Negara via LPSK Kena Efisiensi Anggaran
Skipping article (no matching keywords): Korban Bom Bali Takut Bantuan Negara via LPSK Kena Efisiensi Anggaran
Processing Article: 3 Koper Wabup Deli Serdang Rusak di Bandara Usai Pulang Pelantikan
Skipping article (no matching keywords): 3 Koper Wabup Deli Serdang Rusak di Banda

Unnamed: 0,URL,Date,Title,Content
0,https://www.cnnindonesia.com/nasional/20250223...,23/02/2025,"Gunung Semeru Erupsi Sabtu Malam, Tinggi Letus...",Gunung Semeru yang terletak di Lumajang dan Ma...
1,https://www.cnnindonesia.com/nasional/20250222...,22/02/2025,1.524 Rumah Warga di Bandar Lampung Terendam B...,Setidaknya sebanyak 1.524 rumah warga di sembi...


Saved to cnn_live_scrape_today.xlsx.
Waiting for 2 minutes before the next scrape...

