In [None]:
import requests
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
import time as t
from datetime import datetime

In [32]:
import time as t

# Integrasi Scrape

## Fungsi Data Gabungan

In [24]:
# Helper that refreshes the combined workbook
def update_gabungan(new_articles, gabungan_file="data_gabungan.xlsx"):
    """
    Merge freshly scraped articles into the combined workbook.

    The existing workbook is read when it exists (otherwise an empty frame with the
    URL/Date/Title/Content columns is used), the new articles are placed in front of
    it, rows sharing a URL are reduced to their first occurrence, and the result is
    written back to ``gabungan_file``.
    """
    if os.path.exists(gabungan_file):
        previous = pd.read_excel(gabungan_file)
    else:
        previous = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])

    incoming = pd.DataFrame(new_articles)
    # New rows come first so that keep="first" prefers them over stored duplicates.
    merged = pd.concat([incoming, previous], ignore_index=True).drop_duplicates(
        subset=["URL"], keep="first"
    )
    merged.to_excel(gabungan_file, index=False)
    print(f"Gabungan file ({gabungan_file}) telah diperbarui dengan {len(incoming)} artikel baru.")

## Fungsi Detik

In [36]:
def scrape_detik(file_name="data_detik.xlsx", gabungan_file="data_gabungan.xlsx", timeout=15):
    """
    Scrape page 1 of Detik's "bencana-alam" tag listing for disaster-related articles.

    Articles whose URL is already stored in ``file_name`` are skipped, as are photo
    links and "Video News" pages. Every remaining article is downloaded and kept when
    its title or body matches one of the disaster keywords. New matches are placed in
    front of the rows already in ``file_name`` and are also passed to
    ``update_gabungan``.

    Args:
        file_name (str): Excel workbook holding the Detik articles collected so far.
        gabungan_file (str): Combined workbook forwarded to ``update_gabungan``.
        timeout (float): Seconds to wait for each HTTP response before giving up.

    Returns:
        None. Results are written to the Excel files and progress is printed.
    """
    base_url = "https://www.detik.com/tag/bencana-alam/?sortby=time&page=1"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    keywords = [
        r"\berupsi\b", r"\bgunung meletus\b", r"\btanah longsor\b", r"\blongsor\b",
        r"\bbanjir\b", r"\bbanjir bandang\b", r"\btsunami\b",
        r"\bgempa\b", r"\bgempa bumi\b", r"\bbadai\b", r"\bputing beliung\b", r"\bangin kencang\b",
        r"\bkekeringan\b", r"\bkemarau panjang\b", r"\bhujan es\b", r"\bgelombang panas\b",
        r"\bcuaca ekstrem\b", r"\bgelombang ekstrem\b", r"\bgemuruh laut\b",
        r"\bkebakaran hutan\b", r"\bkebakaran lahan\b", r"\bkarhutla\b", r"\bapi\b"
    ]
    keyword_pattern = re.compile("|".join(keywords), re.IGNORECASE)
    # Indonesian month abbreviations -> the English ones strptime("%b") understands.
    bulan_mapping = {
        "Jan": "Jan", "Feb": "Feb", "Mar": "Mar", "Apr": "Apr", "Mei": "May",
        "Jun": "Jun", "Jul": "Jul", "Agu": "Aug", "Sep": "Sep", "Okt": "Oct",
        "Nov": "Nov", "Des": "Dec"
    }

    print("\nScraping Detik...")

    # Read the existing Detik workbook so already-stored URLs can be skipped
    if os.path.exists(file_name):
        df_existing = pd.read_excel(file_name)
        existing_urls = set(df_existing["URL"].tolist())
    else:
        df_existing = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])
        existing_urls = set()

    try:
        # Without a timeout a stalled connection would block the polling loop forever.
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the page: {e}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article")
    if not articles:
        print("No articles found in Detik.")
        return

    new_articles = []
    for article in articles:
        link_tag = article.find("a", href=True)
        link = link_tag['href'] if link_tag else None
        if not link or "foto-news" in link or "/foto/" in link or link in existing_urls:
            continue

        title_tag = article.find("h2", class_="title")
        title = title_tag.text.strip() if title_tag else "No title"
        print(f"Processing: {title} (Detik)")

        # Reset per article: if the download below fails, the row must not inherit the
        # previous article's date (or hit an unbound name on the very first article).
        date = "Unknown Date"
        try:
            article_response = requests.get(link, headers=headers, timeout=timeout)
            article_response.raise_for_status()
            article_soup = BeautifulSoup(article_response.text, "html.parser")

            # Skip pages flagged as "Video News"
            video_news_tag = article_soup.find("h2", class_="detail__subtitle", string="Video News")
            if video_news_tag:
                continue

            # Remove elements that are not part of the article text
            for tag in article_soup.find_all("div", class_="parallaxindetail scrollpage"):
                tag.decompose()
            for tag in article_soup.find_all("span", class_="para_caption", string="ADVERTISEMENT"):
                tag.decompose()
            for tag in article_soup.find_all("p", class_="para_caption", string="SCROLL TO CONTINUE WITH CONTENT"):
                tag.decompose()

            # Extract the publication date and normalise it to dd/mm/YYYY when possible
            date_tag = article_soup.find("div", class_="detail__date")
            date = date_tag.get_text(strip=True) if date_tag else "Unknown Date"
            match = re.search(r'(\d{1,2}) (\w{3}) (\d{4})', date)
            if match:
                day, month, year = match.groups()
                month = bulan_mapping.get(month, month)
                try:
                    date_obj = datetime.strptime(f"{day} {month} {year}", "%d %b %Y")
                    date = date_obj.strftime("%d/%m/%Y")
                except ValueError:
                    # Unknown month abbreviation or impossible day: keep the raw text
                    # instead of aborting the whole scrape.
                    pass

            # Extract the body text paragraph by paragraph
            paragraphs = article_soup.find_all("p")
            content = ""
            for p in paragraphs:
                p_text = ""
                previous_text = ""
                for element in p.children:
                    if element.name == "a":
                        # Pad link text with spaces so it does not fuse with its neighbours
                        if previous_text:
                            p_text += " "
                        p_text += element.get_text(strip=True)
                        p_text += " "
                    elif element.name is None:
                        # Bare text node
                        text = element.strip()
                        if previous_text:
                            p_text += " " + text
                        else:
                            p_text += text
                    previous_text = p_text.strip()
                content += p_text.strip() + " "
            content = re.sub(r'\s+', ' ', content).strip()
        except requests.exceptions.RequestException:
            print(f"Error fetching article: {link} (Detik)")
            content = "Failed to fetch content."

        if keyword_pattern.search(title) or keyword_pattern.search(content):
            new_articles.append({
                "URL": link,
                "Date": date,
                "Title": title,
                "Content": content
            })

    if new_articles:
        print(f"Found {len(new_articles)} new articles in Detik. Updating files...")
        # Update the Detik-only workbook (new rows first, duplicates by URL dropped)
        df_new = pd.DataFrame(new_articles)
        df_detik_combined = pd.concat([df_new, df_existing], ignore_index=True)
        df_detik_combined = df_detik_combined.drop_duplicates(subset=["URL"], keep="first")
        df_detik_combined.to_excel(file_name, index=False)
        print(f"File {file_name} updated with {len(new_articles)} new articles from Detik.")
        # Update the combined workbook
        update_gabungan(new_articles, gabungan_file)
    else:
        print("No new articles found from Detik.")

## Fungsi Kompas

In [26]:
def scrape_kompas(file_name="data_kompas.xlsx", gabungan_file="data_gabungan.xlsx", timeout=15):
    """
    Scrape page 1 of the Kompas "bencana" tag listing for disaster-related articles.

    Articles whose URL is already stored in ``file_name`` are skipped. Every other
    article is downloaded; its class-less ``<p>`` elements form the body, the text up
    to the first stand-alone dash is removed, and the article is kept when its title
    or body matches one of the disaster patterns. New matches are placed in front of
    the rows already in ``file_name`` and are also passed to ``update_gabungan``.

    Args:
        file_name (str): Excel workbook holding the Kompas articles collected so far.
        gabungan_file (str): Combined workbook forwarded to ``update_gabungan``.
        timeout (float): Seconds to wait for each HTTP response before giving up.

    Returns:
        None. Results are written to the Excel files and progress is printed.
    """
    url = "https://www.kompas.com/tag/bencana?page=1"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    category_patterns = {
        "Banjir": r"\bbanjir\b|\bbanjir bandang\b",
        "Gempa Bumi": r"\bgempa\b|\bgempa bumi\b",
        "Tanah Longsor": r"\btanah longsor\b|\blongsor\b",
        "Gunung Meletus": r"\bgunung meletus\b|\berupsi\b",
        "Tsunami": r"\btsunami\b",
        "Puting Beliung": r"\bputing beliung\b|\bbadai\b",
        "Kekeringan": r"\bkekeringan\b|\bkemarau panjang\b",
        "Cuaca Ekstrem": r"\bcuaca ekstrem\b|\bhujan es\b|\bgelombang panas\b",
        "Gelombang Ekstrem": r"\bgelombang ekstrem\b|\bgemuruh laut\b",
        "Kebakaran Hutan": r"\bkebakaran hutan\b|\bapi di hutan\b|\bkarhutla\b"
    }
    # Only the pattern values are used; the category names are not stored anywhere.
    keyword_pattern = re.compile("|".join(category_patterns.values()), re.IGNORECASE)

    print("\nScraping Kompas...")

    if os.path.exists(file_name):
        df_existing = pd.read_excel(file_name)
        existing_urls = set(df_existing["URL"].tolist())
    else:
        df_existing = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])
        existing_urls = set()

    try:
        # Without a timeout a stalled connection would block the polling loop forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Kompas page: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('div', class_='article__list__title')
    if not articles:
        print("No articles found in Kompas.")
        return

    new_articles = []
    for article in articles:
        # Best effort per article: a missing tag, a timeout or any other failure is
        # reported and the loop moves on to the next listing entry.
        try:
            title = article.find('h3', class_='article__title').get_text(strip=True)
            link = article.find('a', class_='article__link')['href']
            date = article.find_next_sibling('div', class_='article__list__info') \
                          .find('div', class_='article__date') \
                          .get_text(strip=True)
            # Keep only the part before the first comma of the listing date
            date = date.split(',')[0]
            if link in existing_urls:
                continue
            print(f"Processing: {title} (Kompas)")
            article_response = requests.get(link, headers=headers, timeout=timeout)
            if article_response.status_code != 200:
                print(f"Failed to access article: {link}")
                continue
            article_soup = BeautifulSoup(article_response.content, 'html.parser')
            paragraphs = []
            for p in article_soup.find_all('p'):
                # Skip "baca juga" cross-links and the copyright footer
                if p.find('a', class_='inner-link-baca-juga'):
                    continue
                if p.find_parent('div', class_='footerCopyright'):
                    continue
                # Only class-less paragraphs are treated as body text
                if not p.has_attr('class'):
                    text = ''.join([
                        elem.strip() if isinstance(elem, str)
                        else ' ' + elem.get_text(strip=True) + ' '
                        for elem in p.contents
                    ])
                    paragraphs.append(text)
            content = ' '.join(paragraphs)
            # Drop everything up to the first dash that is not inside a word
            # (a dash with alphanumerics on both sides is treated as a hyphen).
            for i, c in enumerate(content):
                if c in ('-', '–'):
                    left_is_alnum = (i > 0 and content[i-1].isalnum())
                    right_is_alnum = (i < len(content) - 1 and content[i+1].isalnum())
                    if left_is_alnum and right_is_alnum:
                        continue
                    content = content[i+1:].strip()
                    break
            if keyword_pattern.search(title) or keyword_pattern.search(content):
                new_articles.append({
                    'URL': link,
                    'Date': date,
                    'Title': title,
                    'Content': content
                })
        except Exception as e:
            print(f"Error processing article: {e}")

    if new_articles:
        print(f"Found {len(new_articles)} new articles in Kompas. Updating files...")
        df_new = pd.DataFrame(new_articles)
        df_kompas_combined = pd.concat([df_new, df_existing], ignore_index=True)
        df_kompas_combined = df_kompas_combined.drop_duplicates(subset=["URL"], keep="first")
        df_kompas_combined.to_excel(file_name, index=False)
        print(f"File {file_name} updated with {len(new_articles)} new articles from Kompas.")
        update_gabungan(new_articles, gabungan_file)
    else:
        print("No new articles found from Kompas.")

## Fungsi CNN

In [27]:
def scrape_cnn(file_name="data_cnn.xlsx", gabungan_file="data_gabungan.xlsx", timeout=15):
    """
    Scrape the CNN Indonesia "peristiwa" index page for disaster-related articles.

    Listing entries whose title contains "FOTO", whose link is missing or not
    absolute, or whose URL is already stored in ``file_name`` are skipped. Every other
    article is downloaded and kept when its title or body matches one of the disaster
    keywords. New matches are placed in front of the rows already in ``file_name`` and
    are also passed to ``update_gabungan``. No date filter is applied: whatever the
    index page lists is considered.

    Args:
        file_name (str): Excel workbook holding the CNN articles collected so far.
        gabungan_file (str): Combined workbook forwarded to ``update_gabungan``.
        timeout (float): Seconds to wait for each HTTP response before giving up.

    Returns:
        None. Results are written to the Excel files and progress is printed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    keywords = [
        r"\berupsi\b", r"\bgunung meletus\b", r"\btanah longsor\b", r"\blongsor\b",
        r"\bbanjir\b", r"\bbanjir bandang\b", r"\btsunami\b",
        r"\bgempa\b", r"\bgempa bumi\b", r"\bbadai\b", r"\bputing beliung\b", r"\bangin kencang\b",
        r"\bkekeringan\b", r"\bkemarau panjang\b", r"\bhujan es\b", r"\bgelombang panas\b",
        r"\bcuaca ekstrem\b", r"\bgelombang ekstrem\b", r"\bgemuruh laut\b",
        r"\bkebakaran hutan\b", r"\bkebakaran lahan\b", r"\bkarhutla\b", r"\bapi\b"
    ]
    keyword_pattern = re.compile("|".join(keywords), re.IGNORECASE)
    # Indonesian month abbreviations -> the English ones strptime("%b") understands.
    bulan_mapping = {
        "Jan": "Jan", "Feb": "Feb", "Mar": "Mar", "Apr": "Apr", "Mei": "May",
        "Jun": "Jun", "Jul": "Jul", "Agu": "Aug", "Sep": "Sep", "Okt": "Oct",
        "Nov": "Nov", "Des": "Dec"
    }
    base_url = "https://www.cnnindonesia.com/peristiwa/indeks/18"
    print("\nScraping CNN Indonesia...")

    if os.path.exists(file_name):
        df_existing = pd.read_excel(file_name)
        existing_urls = set(df_existing["URL"].tolist())
    else:
        df_existing = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])
        existing_urls = set()

    try:
        # Without a timeout a stalled connection would block the polling loop forever.
        response = requests.get(base_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching CNN page: {e}")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="flex-grow")
    if not articles:
        print("No articles found in CNN.")
        return

    new_articles = []
    for article in articles:
        link_tag = article.find("a", href=True)
        link = link_tag['href'] if link_tag else None
        title_tag = article.find("h2", class_="text-cnn_black_light dark:text-white mb-2 inline leading-normal text-xl group-hover:text-cnn_red")
        title = title_tag.text.strip() if title_tag else "No title"
        if "FOTO" in title.upper():
            print(f"Skipping article (contains 'FOTO'): {title}")
            continue
        if not link or not link.startswith("http") or link in existing_urls:
            continue
        print(f"Processing: {title} (CNN)")
        try:
            article_response = requests.get(link, headers=headers, timeout=timeout)
            article_response.raise_for_status()
            article_soup = BeautifulSoup(article_response.text, "html.parser")

            # Publication date, normalised to dd/mm/YYYY when it can be parsed
            date_element = article_soup.find("div", class_="text-cnn_grey text-sm mb-4")
            date = date_element.text.strip() if date_element else "No date found"
            match = re.search(r'(\d{1,2}) (\w{3}) (\d{4})', date)
            if match:
                day, month, year = match.groups()
                month = bulan_mapping.get(month, month)
                try:
                    date_obj = datetime.strptime(f"{day} {month} {year}", "%d %b %Y")
                    date = date_obj.strftime("%d/%m/%Y")
                except ValueError:
                    # Unknown month abbreviation or impossible day: keep the raw text
                    # instead of aborting the whole scrape.
                    pass

            # Body text: paragraphs of the detail container, minus "paradetail" blocks
            content_element = article_soup.find("div", class_="detail-text text-cnn_black text-sm grow min-w-0")
            if content_element:
                for ad_div in content_element.find_all("div", class_="paradetail"):
                    ad_div.decompose()
                content_paragraphs = content_element.find_all("p")
                content = ""
                for p in content_paragraphs:
                    p_text = ""
                    for element in p.children:
                        if element.name == "span" or element.name == "a":
                            # Pad inline tags with spaces so words do not fuse together
                            p_text += " " + element.get_text(strip=True) + " "
                        elif element.name is None:
                            # Bare text node
                            p_text += element.strip() + " "
                    content += p_text.strip() + " "
                content = re.sub(r'\s+', ' ', content).strip()
            else:
                content = "No content found"
            if keyword_pattern.search(title) or keyword_pattern.search(content):
                new_articles.append({
                    "URL": link,
                    "Date": date,
                    "Title": title,
                    "Content": content
                })
            else:
                print(f"Skipping article (no matching keywords): {title}")
        except requests.exceptions.RequestException as e:
            print(f"Error fetching article {link}: {e}")

    if new_articles:
        print(f"Found {len(new_articles)} new articles in CNN. Updating files...")
        df_new = pd.DataFrame(new_articles)
        df_cnn_combined = pd.concat([df_new, df_existing], ignore_index=True)
        df_cnn_combined = df_cnn_combined.drop_duplicates(subset=["URL"], keep="first")
        df_cnn_combined.to_excel(file_name, index=False)
        print(f"File {file_name} updated with {len(new_articles)} new articles from CNN.")
        update_gabungan(new_articles, gabungan_file)
    else:
        print("No new articles found from CNN.")

In [37]:
def run_scrape(interval=120):
    """
    Runs the scrapers sequentially (Detik -> Kompas -> CNN) in an endless loop,
    pausing between cycles, until interrupted with Ctrl + C.

    Args:
        interval (int): Time in seconds to wait between each scraping cycle.
    """
    # Describe the pause truthfully: whole minutes when the interval is a multiple of
    # 60, otherwise seconds (interval // 60 alone reports "0 minutes" for short waits).
    if interval >= 60 and interval % 60 == 0:
        wait_label = f"{interval // 60} minutes"
    else:
        wait_label = f"{interval} seconds"

    try:
        while True:
            print("\n========== Starting New Scraping Cycle ==========\n")

            # Scrape each news source in turn
            scrape_detik()
            scrape_kompas()
            scrape_cnn()

            print(f"\n========== Scraping Completed. Waiting {wait_label} before next cycle ==========\n")
            t.sleep(interval)  # Wait before the next cycle

    except KeyboardInterrupt:
        print("\nScraping stopped by user. Exiting safely.")

In [38]:
run_scrape(interval=10)  # Wait 10 seconds between cycles (interval is given in seconds)




Scraping Detik...
Processing: Longsor Terjang Bungbulang Garut, 1 Orang Tertimbun (Detik)
Processing: Deretan Foto Fenomena Cuaca Mengerikan, Inikah Kiamat? (Detik)
Processing: Penghuni Laut Dalam Ikan Anglerfish Muncul ke Permukaan, Apakah Akan Ada Bencana? (Detik)
Processing: 33 Rumah di Tasikmalaya Rusak Akibat Pergeseran Tanah (Detik)
Found 1 new articles in Detik. Updating files...
File data_detik.xlsx updated with 1 new articles from Detik.
Gabungan file (data_gabungan.xlsx) telah diperbarui dengan 1 artikel baru.

Scraping Kompas...
Processing: Indonesia Rawan Bencana, Ikatan Ahli Dorong Pemerintah Bentuk UU Geologi (Kompas)
Processing: Bencana Pergeseran Tanah, 15 Rumah di Sumbawa Segera Direlokasi (Kompas)
Processing: Pergerakan Tanah Bikin Warga Cikondang Tasikmalaya Resah jika Hujan Mengguyur Desa... (Kompas)
Processing: Pergerakan Tanah di Tasikmalaya Meluas, 44 Rumah Terdampak, Retak-retak (Kompas)
Processing: Ancaman Tanah Bergerak di Pasuruan: Mengapa Relokasi Jadi So

# Text Extraction

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")




In [8]:
# Fetch the NLTK stopword corpus, then build the Indonesian stopword list that
# clean_text() uses to filter tokens.
nltk.download('stopwords')
stop_words = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Fetch the "punkt_tab" NLTK resource — presumably the data sent_tokenize() relies on
# in sentence_tokenize(); TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Model

In [10]:
# Load the trained artefacts from the notebook's working directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects, so these .pkl files
# must come from a trusted source.
# Vectorizer + classifier used by predict_jenis() (file names suggest TF-IDF + XGBoost)
vectorizer_jenis = joblib.load('./tfidf_vectorizer_jenis.pkl')
model_jenis = joblib.load('./model_xgboost_jenis.pkl',)
# Vectorizer + classifier used by predict_impact()
vectorizer_dampak = joblib.load('./tfidf_vectorizer_dampak.pkl')
model_dampak = joblib.load('./model_xgboost_dampak.pkl',)
# Tokenizer and token-classification model used by ner_with_chunking_and_cleaning()
tokenizer = AutoTokenizer.from_pretrained('./model ner')
model = AutoModelForTokenClassification.from_pretrained('./model ner')

In [11]:
# Maps the integer class ids returned by model_jenis.predict() to disaster-type
# labels; predict_jenis() looks its predictions up here.
# NOTE(review): label 1 is spelled "Hidrometerologi" (probably meant
# "Hidrometeorologi"); left untouched because the string is emitted at runtime.
label_mapping = {
    0: 'Banjir',
    1: 'Bencana Hidrometerologi Ekstrem',
    2: 'Gempa Bumi',
    3: 'Gunung Meletus',
    4: 'Puting Beliung',
    5: 'Tanah Longsor',
    6: 'Tsunami'
}

## Fungsi Cleaning

In [12]:
# Helper that normalises the text of an article's content
def clean_text(text):
    """
    Normalise the text of a Content cell:
    - lowercase everything
    - remove digit runs
    - drop tokens found in the module-level ``stop_words``
    - collapse repeated whitespace
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)

    kept_tokens = []
    for token in without_digits.split():
        if token not in stop_words:
            kept_tokens.append(token)

    joined = ' '.join(kept_tokens)
    return re.sub(r'\s+', ' ', joined).strip()

def preprocess_decimal_points(text):
    """
    Mask the dot between two digits with the placeholder <DECIMAL>.
    Example: '20.46' becomes '20<DECIMAL>46'
    """
    def _mask(found):
        # Rebuild the match with the placeholder between the two captured digits.
        return f"{found.group(1)}<DECIMAL>{found.group(2)}"

    return re.sub(r'(\d)\.(\d)', _mask, text)

def preprocess_quoted_dots(text):
    """
    Mask a dot that is immediately followed by a double quote with <QUOTE_DOT>.
    Example: '"Informasi ini penting."' becomes '"Informasi ini penting<QUOTE_DOT>"'
    """
    dot_before_quote = re.compile(r'\.(?=\")')
    return dot_before_quote.sub(r'<QUOTE_DOT>', text)

def preprocess_special_cases(text):
    """
    Handle further special cases: drop the dot that directly follows a
    parenthesised day/month pair.
    Example: 'Senin (1/1).' becomes 'Senin (1/1)'
    """
    def _without_dot(found):
        # Re-emit the parenthesised pair, leaving out the trailing dot.
        return "(" + found.group(1) + "/" + found.group(2) + ")"

    return re.sub(r'\((\d+)/(\d+)\)\.', _without_dot, text)

def preprocess_text(text):
    """
    Run every masking step on the text, in this order:
    1. dots inside decimal numbers -> <DECIMAL>
    2. dots right before a closing double quote -> <QUOTE_DOT>
    3. other special cases, such as the dot after a parenthesised day/month
    """
    steps = (
        preprocess_decimal_points,
        preprocess_quoted_dots,
        preprocess_special_cases,
    )
    for step in steps:
        text = step(text)
    return text

def postprocess_decimal_points(sentences):
    """
    Turn the <DECIMAL> and <QUOTE_DOT> placeholders back into dots.

    Accepts a single string or a list of strings and returns the same kind of
    value; any other input raises ValueError.
    """
    def _restore(chunk):
        return chunk.replace('<DECIMAL>', '.').replace('<QUOTE_DOT>', '.')

    if isinstance(sentences, list):
        return [_restore(sentence) for sentence in sentences]
    if isinstance(sentences, str):
        return _restore(sentences)
    raise ValueError("Input harus berupa string atau list of strings.")

def sentence_tokenize(text):
    """
    Split text into sentences with NLTK while protecting dots that do not end a
    sentence:
    - mask dots in decimal numbers and before closing quotes (preprocess_text)
    - let NLTK's sent_tokenize split the masked text
    - restore the masked dots in every resulting sentence
    """
    masked_text = preprocess_text(text)
    return postprocess_decimal_points(sent_tokenize(masked_text))

def ner_with_chunking_and_cleaning(text, max_length=512):
    """
    Run NER over a long text by splitting it into token chunks and merging "##" word pieces.

    NOTE(review): a function with this same name is defined again further down in the
    notebook (in the "Prediksi Full" section). When the cells run top to bottom, that
    later definition replaces this one. The two differ in how a leading "##" piece is
    treated: here it is dropped when there is no previous entry to attach it to.

    Args:
    - text (str): Text to process.
    - max_length (int): Token budget per chunk, including [CLS] and [SEP] (default: 512).

    Returns:
    - list: The cleaned pipeline results of all chunks, concatenated in chunk order.
    """
    # Split the text into chunks of token ids
    def split_text_into_chunks(text, max_length):
        tokens = tokenizer.encode(text, truncation=False)  # Encode without truncation
        chunks = []
        for i in range(0, len(tokens), max_length - 2):  # Leave room for [CLS] and [SEP]
            chunk = tokens[i:i + max_length - 2]
            # Wrap the slice in [CLS] ... [SEP]
            chunk = [tokenizer.cls_token_id] + chunk + [tokenizer.sep_token_id]
            chunks.append(chunk)
        return chunks

    # Clean the NER output (merge "##" continuation pieces)
    def clean_ner_results(results):
        cleaned_results = []
        for result in results:
            word = result['word']
            # A "##" piece continues the previous word
            if word.startswith("##"):
                if cleaned_results:
                    cleaned_results[-1]['word'] += word[2:]  # Append to the previous entry (mutates that dict)
            else:
                cleaned_results.append(result)
        return cleaned_results

    # Build the NER pipeline (constructed anew on every call)
    pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    # Split the text into chunks
    chunks = split_text_into_chunks(text, max_length)

    # Process each chunk: decode it back to text, run the pipeline, clean the output
    all_results = []
    for chunk in chunks:
        decoded_chunk = tokenizer.decode(chunk, skip_special_tokens=True)
        ner_results = pipe(decoded_chunk)
        cleaned_chunk_results = clean_ner_results(ner_results)  # Merge "##" pieces
        all_results.extend(cleaned_chunk_results)

    return all_results

def insert_ner_results_to_df(df, results):
    """
    Append one [location, date, time] row, derived from NER results, to ``df``.

    - location: the first uninterrupted run of GPE entities (at most four),
      title-cased and joined with ", ".
    - date: taken from the first DAT entity that yields a value; "d/m/YYYY" becomes
      "DD Month YYYY", "d/m" becomes "DD Month", and an unparseable date is stored
      as the raw entity text. Stays None when no DAT entity yields a value.
    - time: taken from the first TIM entity; "H.MM [wib|wita|wit]" is normalised,
      anything else is stored as-is. Defaults to "Tidak ada dalam artikel".

    The row is written in place at ``df.loc[len(df)]``; nothing is returned.
    """
    # Location: first consecutive run of GPE entities, capped at four
    places = []
    for entity in results:
        is_gpe = entity.get("entity_group", None) == "GPE"
        if is_gpe and len(places) < 4:
            places.append(entity.get("word", None))
        elif is_gpe or places:
            # Either a fifth GPE in a row or the run has been interrupted
            break
    location = ", ".join(place.title() for place in places)

    # Date: first DAT entity that produces a value
    date = None
    for entity in results:
        if entity.get("entity_group") != "DAT":
            continue
        raw_date = entity.get("word", None)
        try:
            with_year = re.search(r"(\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4})", raw_date)
            if with_year:
                compact = with_year.group(1).replace(" ", "")
                date = datetime.strptime(compact, "%d/%m/%Y").strftime("%d %B %Y")
            else:
                # Day/month only, e.g. (22/5): validate against the current year
                without_year = re.search(r"(\d{1,2}\s*/\s*\d{1,2})", raw_date)
                if without_year:
                    compact = without_year.group(1).replace(" ", "")
                    compact += f"/{datetime.now().year}"
                    date = datetime.strptime(compact, "%d/%m/%Y").strftime("%d %B")
        except ValueError:
            # Not a valid calendar date: keep the entity text unchanged
            date = raw_date
        if date is not None:
            break

    # Time: first TIM entity
    time = None
    for entity in results:
        if entity.get("entity_group") != "TIM":
            continue
        raw_time = entity.get("word", None)
        clock = re.search(r"(\d{1,2})\.\s*(\d{2})\s*(wib|wita|wit)?", raw_time, re.IGNORECASE)
        if clock:
            hours, minutes, zone = clock.groups()
            zone_label = zone.upper() if zone else ""
            time = f"{hours}.{minutes} {zone_label}".strip()
        else:
            time = raw_time
        if time is not None:
            break

    # Default text when the article mentions no time
    if time is None:
        time = "Tidak ada dalam artikel"

    # Append the row to the DataFrame
    df.loc[len(df)] = [location, date, time]

## Prediksi Full

In [17]:
# 1. Disaster-type prediction
def predict_jenis(konten):
    """
    Predict the disaster type of one article body.

    The text is cleaned with clean_text, vectorised with vectorizer_jenis and
    classified by model_jenis; the numeric prediction is translated through
    label_mapping and the label of the first (only) prediction is returned.
    """
    features = vectorizer_jenis.transform([clean_text(konten)])
    # Translate numeric class ids into their string labels
    predicted_labels = [label_mapping[class_id] for class_id in model_jenis.predict(features)]
    return predicted_labels[0]

# 2. NER with chunking and cleaning
def ner_with_chunking_and_cleaning(text, max_length=512):
    """
    Run NER over a long text in token chunks and merge "##" word pieces.

    The text is encoded without truncation, cut into slices of ``max_length - 2``
    token ids, each slice is wrapped in [CLS]/[SEP], decoded back to text and fed to
    the NER pipeline. Within a chunk, a result whose word starts with "##" is glued
    onto the previous result (a leading "##" result with no predecessor is kept
    unchanged). The per-chunk results are returned as one list, in chunk order.
    """
    ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    window = max_length - 2  # room for [CLS] and [SEP]
    token_ids = tokenizer.encode(text, truncation=False)
    framed_chunks = [
        [tokenizer.cls_token_id] + token_ids[start:start + window] + [tokenizer.sep_token_id]
        for start in range(0, len(token_ids), window)
    ]

    collected = []
    for framed in framed_chunks:
        chunk_text = tokenizer.decode(framed, skip_special_tokens=True)
        chunk_entities = []
        for entity in ner_pipe(chunk_text):
            piece = entity['word']
            if piece.startswith("##") and chunk_entities:
                # Continuation piece: extend the previous entity's word in place
                chunk_entities[-1]['word'] += piece[2:]
            else:
                chunk_entities.append(entity)
        collected.extend(chunk_entities)
    return collected

# Extract the NER entities of interest (location, date, time)
def extract_ner_entities(results):
    """
    Derive (location, date, time) from a list of NER result dicts.

    - location: the first uninterrupted run of GPE entities (at most four),
      title-cased and joined with ", "; empty string when there is none.
    - date: taken from the first DAT entity that yields a value; "d/m/YYYY" becomes
      "DD Month YYYY", "d/m" becomes "DD Month", and an unparseable date is returned
      as the raw entity text. None when no DAT entity yields a value.
    - time: taken from the first TIM entity; "H.MM [wib|wita|wit]" is normalised,
      anything else is returned as-is. Defaults to "Tidak ada dalam artikel".
    """
    # Location: first consecutive run of GPE entities, capped at four
    places = []
    for entity in results:
        is_gpe = entity.get("entity_group") == "GPE"
        if is_gpe and len(places) < 4:
            places.append(entity.get("word"))
        elif is_gpe or places:
            # Either a fifth GPE in a row or the run has been interrupted
            break
    location = ", ".join(place.title() for place in places)

    # Date: first DAT entity that produces a value
    date = None
    for entity in results:
        if entity.get("entity_group") != "DAT":
            continue
        raw_date = entity.get("word")
        try:
            with_year = re.search(r"(\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4})", raw_date)
            if with_year:
                compact = with_year.group(1).replace(" ", "")
                date = datetime.strptime(compact, "%d/%m/%Y").strftime("%d %B %Y")
            else:
                # Day/month only: validate against the current year
                without_year = re.search(r"(\d{1,2}\s*/\s*\d{1,2})", raw_date)
                if without_year:
                    compact = without_year.group(1).replace(" ", "")
                    compact += f"/{datetime.now().year}"
                    date = datetime.strptime(compact, "%d/%m/%Y").strftime("%d %B")
        except ValueError:
            # Not a valid calendar date: keep the entity text unchanged
            date = raw_date
        if date is not None:
            break

    # Time: first TIM entity
    time = None
    for entity in results:
        if entity.get("entity_group") != "TIM":
            continue
        raw_time = entity.get("word")
        clock = re.search(r"(\d{1,2})\.\s*(\d{2})\s*(wib|wita|wit)?", raw_time, re.IGNORECASE)
        if clock:
            hours, minutes, zone = clock.groups()
            zone_label = zone.upper() if zone else ""
            time = f"{hours}.{minutes} {zone_label}".strip()
        else:
            time = raw_time
        if time is not None:
            break

    if time is None:
        time = "Tidak ada dalam artikel"
    return location, date, time

# 3. Fungsi prediksi kalimat dampak bencana
def predict_impact(konten):
    """
    Collect the sentences of an article that the impact classifier labels as 1.

    The text is split with ``sentence_tokenize``, the sentences are vectorized with
    ``vectorizer_dampak`` and classified with ``model_dampak``. Sentences whose
    predicted label equals 1 are kept in their original order.

    Args:
        konten: Article text to analyse.

    Returns:
        str: The kept sentences joined by single spaces. Empty string when no
        sentence is kept or when the text yields no sentences at all.
    """
    kalimat_list = sentence_tokenize(konten)
    if len(kalimat_list) == 0:
        # Nothing to classify. Returning early avoids handing the model a zero-row
        # matrix, which scikit-learn style estimators reject.
        # NOTE(review): confirm the type of model_dampak.
        return ""
    dampak_vectorized = vectorizer_dampak.transform(kalimat_list)
    dampak_pred = model_dampak.predict(dampak_vectorized)
    kalimat_berdampak = [kalimat for kalimat, label in zip(kalimat_list, dampak_pred) if label == 1]
    return " ".join(kalimat_berdampak)

In [42]:
# Load earlier predictions if data_prediksi.xlsx exists; otherwise start from an empty table
if os.path.exists("data_prediksi.xlsx"):
    df_prediksi = pd.read_excel("data_prediksi.xlsx")
else:
    df_prediksi = pd.DataFrame(columns=["URL", "Type", "Location", "Date", "Time", "Impact"])

# Load the combined scraped articles if data_gabungan.xlsx exists; otherwise use an empty table
if os.path.exists("data_gabungan.xlsx"):
    df_gabungan = pd.read_excel("data_gabungan.xlsx")
else:
    df_gabungan = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])

# Run the prediction steps for every article in df_gabungan
for index, row in df_gabungan.iterrows():
    url = row.get("URL", None)

    # Skip the article when its URL already has a prediction row
    if url is not None and url in df_prediksi['URL'].values:
        continue

    konten = row['Content']

    # 1. Predict the disaster type
    pred_type = predict_jenis(konten)

    # 2. Extract NER entities: location, date, time
    ner_results = ner_with_chunking_and_cleaning(konten)
    location, date, time = extract_ner_entities(ner_results)

    # 3. Predict the sentences describing the disaster impact
    impact_text = predict_impact(konten)

    # Add the result to df_prediksi
    new_row = {
        "URL": url,
        "Type": pred_type,
        "Location": location,
        "Date": date,
        "Time": time,
        "Impact": impact_text
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
    df_prediksi = pd.concat([df_prediksi, pd.DataFrame([new_row])], ignore_index=True)

    # Save after every article so finished predictions survive an interruption
    df_prediksi.to_excel("data_prediksi.xlsx", index=False)

Device set to use cpu
Device set to use cpu


In [45]:
# Display the current contents of df_prediksi as the cell output.
df_prediksi

Unnamed: 0,URL,Type,Location,Date,Time,Impact
0,https://www.detik.com/jabar/berita/d-7791405/k...,Banjir,"Kecamatan Cijeungjing, Kabupaten Ciamis",22 February 2025,Tidak ada dalam artikel,Sebanyak 7 bangunan terdiri dari rumah warga d...
1,https://www.detik.com/sulsel/berita/d-7786842/...,Puting Beliung,"Kabupaten Bulukumba, Sulawesi Selatan, Sulsel",20 February 2025,14.30 WITA,Sebanyak 14 unit rumah mengalami kerusakan aki...
2,https://www.detik.com/edu/detikpedia/d-7786373...,Gempa Bumi,Jepang,,Tidak ada dalam artikel,
3,https://www.detik.com/jatim/berita/d-7782941/a...,Puting Beliung,Kabupaten Sidoarjo,17 February 2025,14.30 WIB,"Akibatnya, puluhan rumah warga mengalami kerus..."
4,https://www.detik.com/sulsel/berita/d-7780631/...,Puting Beliung,"Kabupaten Banggai, Sulawesi Tengah, Sulteng",16 February 2025,Tidak ada dalam artikel,"Asrama Polsek Balantak di Kabupaten Banggai, S..."
5,https://www.detik.com/jabar/berita/d-7780166/h...,Puting Beliung,"Kecamatan Sukaraja, Kabupaten Tasikmalaya, Jaw...",15 February,Tidak ada dalam artikel,Di Desa Leuwibudah dua rumah tertimpa pohon se...
6,https://www.detik.com/bali/nusra/d-7774720/fen...,Tanah Longsor,"Desa Tangkampulit, Kecamatan Batulanteh, Kabup...",12 February 2025,11.00 WITA,Sebanyak 15 rumah warga terdampak fenomena tan...
7,https://www.detik.com/sulsel/berita/d-7772781/...,Puting Beliung,"Kabupaten Bulukumba, Sulawesi Selatan, Sulsel",11 February 2025,14.30 WITA,Sebanyak 15 unit rumah dan 1 masjid di Kabupat...
8,http://regional.kompas.com/read/2025/02/20/171...,Gunung Meletus,"Flores Timur, Nusa Tenggara Timur, Ntt",21 December 2024,Tidak ada dalam artikel,
9,http://surabaya.kompas.com/read/2025/02/20/110...,Gunung Meletus,"Kabupaten Lumajang, Jawa Timur",20 February 2025,00.00 WIB,Sebanyak empat kali erupsi terpantau jelas sec...


# Integrasi Scrape dan Prediksi

In [46]:
def run_scrape_predict(interval=120):
    """
    Repeatedly scrape the news sources and predict on articles not yet processed.

    Each cycle calls scrape_detik(), scrape_kompas() and scrape_cnn() in that order,
    then reads data_gabungan.xlsx and, for every row whose URL is not yet present in
    data_prediksi.xlsx, predicts the disaster type, extracts location/date/time and
    the impact sentences, and writes the row to data_prediksi.xlsx. The loop repeats
    until a KeyboardInterrupt is received.

    The prediction table is held in a local variable and written to disk; a
    notebook-level ``df_prediksi`` is not modified by this function.

    Args:
        interval (int): Waiting time (in seconds) between cycles.
    """
    try:
        while True:
            print("\n========== Starting New Scraping Cycle ==========\n")

            # Scrape each news source in turn
            scrape_detik()   # updates data_detik.xlsx and data_gabungan.xlsx
            scrape_kompas()  # updates data_kompas.xlsx and data_gabungan.xlsx
            scrape_cnn()     # updates data_cnn.xlsx and data_gabungan.xlsx

            print("\n========== Scraping Completed. Starting Prediction Process ==========\n")

            # Reload data_gabungan.xlsx (it may have been updated with new articles)
            if os.path.exists("data_gabungan.xlsx"):
                df_gabungan = pd.read_excel("data_gabungan.xlsx")
            else:
                df_gabungan = pd.DataFrame(columns=["URL", "Date", "Title", "Content"])

            # Read or initialise data_prediksi.xlsx
            if os.path.exists("data_prediksi.xlsx"):
                df_prediksi = pd.read_excel("data_prediksi.xlsx")
            else:
                df_prediksi = pd.DataFrame(columns=["URL", "Type", "Location", "Date", "Time", "Impact"])

            # Predict for every row of df_gabungan whose URL is not yet in df_prediksi
            for _, row in df_gabungan.iterrows():
                url = row.get("URL", None)
                if url is not None and url in df_prediksi['URL'].values:
                    continue  # already predicted

                konten = row['Content']

                # 1. Predict the disaster type
                pred_type = predict_jenis(konten)

                # 2. Extract NER entities: location, date, time
                ner_results = ner_with_chunking_and_cleaning(konten)
                location, date, time_extracted = extract_ner_entities(ner_results)

                # 3. Predict the sentences describing the disaster impact
                impact_text = predict_impact(konten)

                # Add the prediction to df_prediksi
                new_row = {
                    "URL": url,
                    "Type": pred_type,
                    "Location": location,
                    "Date": date,
                    "Time": time_extracted,
                    "Impact": impact_text
                }
                # DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions
                df_prediksi = pd.concat([df_prediksi, pd.DataFrame([new_row])], ignore_index=True)

                # Save after every article so finished predictions survive an interruption
                df_prediksi.to_excel("data_prediksi.xlsx", index=False)

            print("Prediction process completed. Data prediksi telah diperbarui.")

            # Whole minutes keep the original wording; any other interval is reported in
            # seconds, because interval // 60 printed "0 minutes" for e.g. interval=10.
            minutes, leftover_seconds = divmod(interval, 60)
            if leftover_seconds == 0:
                wait_text = f"{minutes} minutes"
            else:
                wait_text = f"{interval} seconds"
            print(f"\n========== Cycle Completed. Waiting {wait_text} before next cycle ==========\n")
            t.sleep(interval)
    except KeyboardInterrupt:
        print("\nScraping and prediction stopped by user. Exiting safely.")

In [48]:
# Start the scrape-and-predict loop with a 10-second pause between cycles.
# The loop has no exit condition of its own; interrupt the kernel to stop it.
run_scrape_predict(interval=10)




Scraping Detik...
Processing: Longsor Terjang Bungbulang Garut, 1 Orang Tertimbun (Detik)
Processing: Deretan Foto Fenomena Cuaca Mengerikan, Inikah Kiamat? (Detik)
Processing: Penghuni Laut Dalam Ikan Anglerfish Muncul ke Permukaan, Apakah Akan Ada Bencana? (Detik)
Processing: 33 Rumah di Tasikmalaya Rusak Akibat Pergeseran Tanah (Detik)
Found 1 new articles in Detik. Updating files...
File data_detik.xlsx updated with 1 new articles from Detik.
Gabungan file (data_gabungan.xlsx) telah diperbarui dengan 1 artikel baru.

Scraping Kompas...
Processing: Indonesia Rawan Bencana, Ikatan Ahli Dorong Pemerintah Bentuk UU Geologi (Kompas)
Processing: Bencana Pergeseran Tanah, 15 Rumah di Sumbawa Segera Direlokasi (Kompas)
Processing: Pergerakan Tanah Bikin Warga Cikondang Tasikmalaya Resah jika Hujan Mengguyur Desa... (Kompas)
Processing: Pergerakan Tanah di Tasikmalaya Meluas, 44 Rumah Terdampak, Retak-retak (Kompas)
Processing: Ancaman Tanah Bergerak di Pasuruan: Mengapa Relokasi Jadi So

Device set to use cpu


Skipping article (no matching keywords): Wagub Rano Ungkap Wacana Bangun Pengolahan Sampah RDF Plant Lagi
Found 1 new articles in CNN. Updating files...
File data_cnn.xlsx updated with 1 new articles from CNN.
Gabungan file (data_gabungan.xlsx) telah diperbarui dengan 1 artikel baru.




Device set to use cpu
Device set to use cpu


Prediction process completed. Data prediksi telah diperbarui.





Scraping Detik...
Processing: Deretan Foto Fenomena Cuaca Mengerikan, Inikah Kiamat? (Detik)
Processing: Penghuni Laut Dalam Ikan Anglerfish Muncul ke Permukaan, Apakah Akan Ada Bencana? (Detik)
Processing: 33 Rumah di Tasikmalaya Rusak Akibat Pergeseran Tanah (Detik)
No new articles found from Detik.

Scraping Kompas...
Processing: Indonesia Rawan Bencana, Ikatan Ahli Dorong Pemerintah Bentuk UU Geologi (Kompas)
Processing: Bencana Pergeseran Tanah, 15 Rumah di Sumbawa Segera Direlokasi (Kompas)
Processing: Pergerakan Tanah Bikin Warga Cikondang Tasikmalaya Resah jika Hujan Mengguyur Desa... (Kompas)
Processing: Pergerakan Tanah di Tasikmalaya Meluas, 44 Rumah Terdampak, Retak-retak (Kompas)
Processing: Ancaman Tanah Bergerak di Pasuruan: Mengapa Relokasi Jadi Solusi Terbaik? (Kompas)
Processing: Puluhan Rumah di Tasikmalaya Terdampak Pergerakan Tanah, BPBD Lapor Tim Geologi Bandung (Kompas)
Processing: Efisiensi Angga

In [49]:
# run_scrape_predict() binds df_prediksi as a local variable and only writes its
# results to data_prediksi.xlsx, so the notebook-level df_prediksi is not changed by
# it. Reload the file to display the predictions produced by the loop.
if os.path.exists("data_prediksi.xlsx"):
    df_prediksi = pd.read_excel("data_prediksi.xlsx")
df_prediksi

Unnamed: 0,URL,Type,Location,Date,Time,Impact
0,https://www.detik.com/jabar/berita/d-7791405/k...,Banjir,"Kecamatan Cijeungjing, Kabupaten Ciamis",22 February 2025,Tidak ada dalam artikel,Sebanyak 7 bangunan terdiri dari rumah warga d...
1,https://www.detik.com/sulsel/berita/d-7786842/...,Puting Beliung,"Kabupaten Bulukumba, Sulawesi Selatan, Sulsel",20 February 2025,14.30 WITA,Sebanyak 14 unit rumah mengalami kerusakan aki...
2,https://www.detik.com/edu/detikpedia/d-7786373...,Gempa Bumi,Jepang,,Tidak ada dalam artikel,
3,https://www.detik.com/jatim/berita/d-7782941/a...,Puting Beliung,Kabupaten Sidoarjo,17 February 2025,14.30 WIB,"Akibatnya, puluhan rumah warga mengalami kerus..."
4,https://www.detik.com/sulsel/berita/d-7780631/...,Puting Beliung,"Kabupaten Banggai, Sulawesi Tengah, Sulteng",16 February 2025,Tidak ada dalam artikel,"Asrama Polsek Balantak di Kabupaten Banggai, S..."
5,https://www.detik.com/jabar/berita/d-7780166/h...,Puting Beliung,"Kecamatan Sukaraja, Kabupaten Tasikmalaya, Jaw...",15 February,Tidak ada dalam artikel,Di Desa Leuwibudah dua rumah tertimpa pohon se...
6,https://www.detik.com/bali/nusra/d-7774720/fen...,Tanah Longsor,"Desa Tangkampulit, Kecamatan Batulanteh, Kabup...",12 February 2025,11.00 WITA,Sebanyak 15 rumah warga terdampak fenomena tan...
7,https://www.detik.com/sulsel/berita/d-7772781/...,Puting Beliung,"Kabupaten Bulukumba, Sulawesi Selatan, Sulsel",11 February 2025,14.30 WITA,Sebanyak 15 unit rumah dan 1 masjid di Kabupat...
8,http://regional.kompas.com/read/2025/02/20/171...,Gunung Meletus,"Flores Timur, Nusa Tenggara Timur, Ntt",21 December 2024,Tidak ada dalam artikel,
9,http://surabaya.kompas.com/read/2025/02/20/110...,Gunung Meletus,"Kabupaten Lumajang, Jawa Timur",20 February 2025,00.00 WIB,Sebanyak empat kali erupsi terpantau jelas sec...
