## **Crawling**

### Gamebrott

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import re 

from bs4 import XMLParsedAsHTMLWarning
import warnings

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# --- Konfigurasi ---
SITEMAP_INDEX_URL = "https://gamebrott.com/sitemap_index.xml"
URL_OUTPUT_FILE = "../data/crawled_urls.txt" 
LOG_FILE = "../data/crawl_logs.txt"
PORTAL_NAME = "Gamebrott"

output_dir = os.path.dirname(URL_OUTPUT_FILE)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def log_to_file(message):
    """Fungsi untuk menulis pesan log ke file dengan timestamp."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}\n"
    print(log_entry.strip())
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(log_entry)

def get_sitemap_number(url):
    """Mengekstrak nomor dari URL sitemap untuk sorting yang benar."""
    match = re.search(r'post-sitemap(\d+)\.xml', url)
    return int(match.group(1)) if match else 0

def crawl_xml_sitemap():
    """
    Fungsi utama untuk crawling sitemap XML, dengan logging dan penyimpanan append.
    """
    log_to_file(f"===== Memulai Proses Crawling {PORTAL_NAME} =====")
    
    newly_found_urls = []
    
    try:
        log_to_file(f"Mengambil sitemap index dari: {SITEMAP_INDEX_URL}")
        response_main = requests.get(SITEMAP_INDEX_URL, headers=HEADERS, timeout=15)
        response_main.raise_for_status()

        # Gunakan "lxml-xml" atau "xml" dengan lxml sebagai parser
        # Jika masih error, gunakan "html.parser" sebagai fallback
        try:
            soup_main = BeautifulSoup(response_main.content, "lxml-xml")
        except:
            log_to_file("  -> Mencoba parser alternatif...")
            soup_main = BeautifulSoup(response_main.content, "html.parser")
        
        all_sitemap_links = [loc.text for loc in soup_main.find_all('loc')]
        
        if not all_sitemap_links:
            log_to_file("ERROR: Tidak ada tag <loc> yang ditemukan di sitemap index.")
            return

        log_to_file(f"Total sitemap ditemukan di index: {len(all_sitemap_links)}")
        
        post_sitemap_links = [url for url in all_sitemap_links if 'post-sitemap' in url]
        log_to_file(f"Menyaring sitemap... Ditemukan {len(post_sitemap_links)} link yang mengandung 'post-sitemap'.")

        post_sitemap_links.sort(key=get_sitemap_number)
        
        sitemaps_to_process = post_sitemap_links[:3]
        log_to_file(f"Membatasi proses hanya untuk {len(sitemaps_to_process)} sitemap pertama.")
        
        for i, sitemap_url in enumerate(sitemaps_to_process, 1):
            log_to_file(f"({i}/{len(sitemaps_to_process)}) Memproses sitemap: {sitemap_url}")
            try:
                response_post = requests.get(sitemap_url, headers=HEADERS, timeout=15)
                response_post.raise_for_status()

                # Gunakan parser yang sama
                try:
                    soup_post = BeautifulSoup(response_post.content, "lxml-xml")
                except:
                    soup_post = BeautifulSoup(response_post.content, "html.parser")
                
                article_links_in_page = []
                url_blocks = soup_post.find_all('url')
                
                for block in url_blocks:
                    loc_tag = block.find('loc')
                    if loc_tag:
                        article_links_in_page.append(loc_tag.text)
                
                log_to_file(f"  -> Ditemukan {len(article_links_in_page)} link artikel (gambar diabaikan).")
                newly_found_urls.extend(article_links_in_page)

            except requests.exceptions.RequestException as e:
                log_to_file(f"  -> ERROR: Gagal mengambil atau memproses {sitemap_url}: {e}")
                continue

    except requests.exceptions.RequestException as e:
        log_to_file(f"FATAL ERROR: Gagal mengambil sitemap index utama. Proses dihentikan. Error: {e}")
        return

    if newly_found_urls:
        log_to_file(f"Menyimpan {len(newly_found_urls)} link baru ke file {URL_OUTPUT_FILE}...")
        try:
            with open(URL_OUTPUT_FILE, 'a', encoding='utf-8') as f:
                for url in newly_found_urls:
                    # Menggunakan variabel PORTAL_NAME
                    f.write(f"{PORTAL_NAME};{url}\n")
            log_to_file("Penyimpanan link berhasil.")
        except IOError as e:
            log_to_file(f"ERROR: Gagal menulis ke file {URL_OUTPUT_FILE}: {e}")
    else:
        log_to_file("Tidak ada link baru yang ditemukan pada sesi crawling ini.")
    
    log_to_file(f"Total link yang didapat pada sesi ini: {len(newly_found_urls)}")
    log_to_file("===== Proses Crawling Selesai =====\n")


if __name__ == "__main__":
    crawl_xml_sitemap()

### Kotakgames

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os

import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

START_YEAR = 2025
END_YEAR = 2025  # Sama dengan START_YEAR untuk hanya mengambil tahun 2025
MAX_LINKS = 500

# --- Konfigurasi ---
URL_OUTPUT_FILE = "../data/crawled_urls.txt"
LOG_FILE = "../data/crawl_logs.txt"
PORTAL_NAME = "Kotakgame"

output_dir = os.path.dirname(URL_OUTPUT_FILE)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def log_to_file(message):
    # Gunakan format yang kompatibel dengan Windows dan Linux
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}\n"
    print(log_entry.strip())
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(log_entry)

def get_last_page(year_base_url):
    """Fungsi ini sekarang menerima base URL untuk tahun tertentu."""
    first_page_url = year_base_url + "1/"
    try:
        log_to_file(f"  -> Mencari halaman terakhir dari: {first_page_url}")
        response = requests.get(first_page_url, headers=HEADERS, timeout=15, verify=False)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, "html.parser")
        last_page_link = soup.find('a', class_='prevnext', text='LAST ¬ª')
        
        if last_page_link and last_page_link.has_attr('href'):
            href = last_page_link['href']
            page_number = int(href.strip('/').split('/')[-1])
            log_to_file(f"  -> Halaman terakhir ditemukan: {page_number}")
            return page_number
        else:
            log_to_file("  -> WARNING: Link 'LAST ¬ª' tidak ditemukan. Mengasumsikan hanya ada 1 halaman.")
            return 1
            
    except requests.exceptions.RequestException as e:
        log_to_file(f"  -> ERROR: Tidak bisa mengakses halaman pertama untuk tahun ini. Error: {e}")
        return 0 # Mengembalikan 0 jika gagal, untuk dilewati
    except (ValueError, IndexError):
        log_to_file("  -> FATAL ERROR: Gagal mem-parsing nomor halaman terakhir dari link.")
        return 0

def crawl_kotakgame_multi_year(max_links=None):
    """
    Crawl artikel dari Kotakgame.com
    
    Args:
        max_links (int, optional): Jumlah maksimal link yang ingin diambil.
                                   Jika None, akan mengambil semua link.
    """
    log_to_file(f"===== Memulai Proses Crawling {PORTAL_NAME} (Tahun {START_YEAR}-{END_YEAR}) =====")
    if max_links:
        log_to_file(f"Target: Mengambil maksimal {max_links} link")
    
    grand_total_urls = []
    
    for year in range(START_YEAR, END_YEAR + 1):
        # Cek apakah sudah mencapai limit
        if max_links and len(grand_total_urls) >= max_links:
            log_to_file(f"Sudah mencapai limit {max_links} link. Menghentikan crawling.")
            break
            
        log_to_file(f"--- Memproses Tahun {year} ---")
        
        # Base URL sekarang dinamis berdasarkan tahun
        base_url_for_year = f"https://www.kotakgame.com/berita/index/{year}/0/0/"
        
        last_page = get_last_page(base_url_for_year)
        if last_page == 0:
            log_to_file(f"Melewati tahun {year} karena gagal mendapatkan info halaman.")
            continue # Lanjut ke tahun berikutnya

        urls_this_year = []
        for page_num in range(1, last_page + 1):
            # Cek limit sebelum memproses halaman baru
            if max_links and len(grand_total_urls) >= max_links:
                log_to_file(f"    -> Sudah mencapai limit {max_links} link. Melewati halaman berikutnya.")
                break
                
            page_url = f"{base_url_for_year}{page_num}/"
            log_to_file(f"    ({page_num}/{last_page}) Memproses halaman: {page_url}")
            
            try:
                response = requests.get(page_url, headers=HEADERS, timeout=15, verify=False)
                response.raise_for_status()
                
                soup = BeautifulSoup(response.content, "html.parser")
                article_links_found = soup.select('div#contenta div.detailfeature h4 a')
                
                page_urls = []
                for link_tag in article_links_found:
                    if link_tag.has_attr('href'):
                        relative_url = link_tag['href']
                        full_url = f"https://www.kotakgame.com{relative_url}"
                        page_urls.append(full_url)
                
                # Batasi jumlah URL yang diambil jika sudah mendekati limit
                if max_links:
                    remaining_links = max_links - len(grand_total_urls)
                    if remaining_links < len(page_urls):
                        page_urls = page_urls[:remaining_links]
                        log_to_file(f"      -> Membatasi ke {remaining_links} link untuk mencapai target {max_links}")
                
                log_to_file(f"      -> Ditemukan {len(page_urls)} link artikel.")
                urls_this_year.extend(page_urls)
                grand_total_urls.extend(page_urls)
                
                # Cek apakah sudah mencapai limit setelah menambah URL
                if max_links and len(grand_total_urls) >= max_links:
                    log_to_file(f"      -> Target {max_links} link tercapai!")
                    break

            except requests.exceptions.RequestException as e:
                log_to_file(f"      -> ERROR: Gagal mengambil {page_url}: {e}")
                continue
        
        log_to_file(f"--- Selesai Tahun {year}, ditemukan {len(urls_this_year)} link ---")

    if grand_total_urls:
        unique_urls = sorted(list(set(grand_total_urls)))
        log_to_file(f"Menyimpan total {len(unique_urls)} link unik dari semua tahun ke file {URL_OUTPUT_FILE}...")
        try:
            with open(URL_OUTPUT_FILE, 'a', encoding='utf-8') as f:
                for url in unique_urls:
                    f.write(f"{PORTAL_NAME};{url}\n")
            log_to_file("Penyimpanan link berhasil.")
        except IOError as e:
            log_to_file(f"ERROR: Gagal menulis ke file {URL_OUTPUT_FILE}: {e}")
    else:
        log_to_file("Tidak ada link baru yang ditemukan pada sesi crawling ini.")
    
    log_to_file(f"Total link yang didapat pada sesi ini: {len(grand_total_urls)}")
    log_to_file(f"Total link unik yang disimpan: {len(unique_urls) if grand_total_urls else 0}")
    log_to_file("===== Proses Crawling Selesai =====\n")

if __name__ == "__main__":
    # Panggil dengan parameter max_links
    # Ubah nilai MAX_LINKS sesuai kebutuhan, atau set None untuk mengambil semua
    crawl_kotakgame_multi_year(max_links=MAX_LINKS)

### Indogamers

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import time 
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# --- Konfigurasi ---
BASE_URL = "https://indogamers.com/"
CATEGORIES = ['guides', 'pc', 'console', 'mobile'] 
URL_OUTPUT_FILE = "../data/crawled_urls.txt"
LOG_FILE = "../data/crawl_logs.txt"
MAX_LINKS = 500 

output_dir = os.path.dirname(URL_OUTPUT_FILE)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def log_to_file(message):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}\n"
    print(log_entry.strip())
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(log_entry)

def crawl_indogamers_deep(max_links=None):
    """
    Crawl artikel dari Indogamers.com
    
    Args:
        max_links (int, optional): Jumlah maksimal link yang ingin diambil.
                                   Jika None, akan mengambil semua link.
    """
    log_to_file("===== Memulai Proses Crawling Mendalam Indogamers.com =====")
    if max_links:
        log_to_file(f"Target: Mengambil maksimal {max_links} link")
    
    total_urls_found_session = 0

    for category in CATEGORIES:
        # Cek apakah sudah mencapai limit
        if max_links and total_urls_found_session >= max_links:
            log_to_file(f"Sudah mencapai limit {max_links} link. Menghentikan crawling.")
            break
            
        log_to_file(f"Memulai kategori: '{category}'")
        page_num = 1
        urls_per_category = 0
        
        while True:
            # Cek limit sebelum memproses halaman baru
            if max_links and total_urls_found_session >= max_links:
                log_to_file(f"  -> Sudah mencapai limit {max_links} link. Melewati kategori '{category}'.")
                break
                
            page_url = f"{BASE_URL}{category}?page={page_num}"
            log_to_file(f"  -> Memproses halaman: {page_url}")
            
            try:
                response = requests.get(page_url, headers=HEADERS, timeout=20, verify=False)
                
                if response.status_code == 404:
                    log_to_file(f"    -> Halaman tidak ditemukan (404). Akhir dari kategori '{category}'.")
                    break

                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")
                
                selector = "div[class*='article_recent__'] div[class*='article_recent_desc__'] h1 a"
                link_tags = soup.select(selector)
                
                if not link_tags:
                    log_to_file(f"    -> Tidak ada artikel ditemukan. Akhir dari kategori '{category}'.")
                    break
                
                page_urls = [tag['href'] for tag in link_tags if tag.has_attr('href')]
                
                # Batasi jumlah URL yang diambil jika sudah mendekati limit
                if max_links:
                    remaining_links = max_links - total_urls_found_session
                    if remaining_links < len(page_urls):
                        page_urls = page_urls[:remaining_links]
                        log_to_file(f"    -> Membatasi ke {remaining_links} link untuk mencapai target {max_links}")
                
                log_to_file(f"    -> Ditemukan {len(page_urls)} link artikel.")
                
                with open(URL_OUTPUT_FILE, 'a', encoding='utf-8') as f:
                    for url in page_urls:
                        f.write(f"Indogamers;{url}\n")
                
                urls_per_category += len(page_urls)
                total_urls_found_session += len(page_urls)
                
                # Cek apakah sudah mencapai limit setelah menambah URL
                if max_links and total_urls_found_session >= max_links:
                    log_to_file(f"    -> Target {max_links} link tercapai!")
                    break
                
                page_num += 1 
                time.sleep(1) 
                
            except requests.exceptions.RequestException as e:
                log_to_file(f"    -> ERROR: Gagal mengambil {page_url}: {e}. Mencoba lagi dalam 5 detik...")
                time.sleep(5)
                continue 
        
        log_to_file(f"Selesai kategori '{category}'. Total link ditemukan: {urls_per_category}")

    log_to_file(f"Total link yang didapat pada sesi ini: {total_urls_found_session}")
    log_to_file("===== Proses Crawling Selesai =====\n")

if __name__ == "__main__":
    # Panggil dengan parameter max_links
    # Ubah nilai MAX_LINKS sesuai kebutuhan, atau set None untuk mengambil semua
    crawl_indogamers_deep(max_links=MAX_LINKS)

### JagatPlay

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import re 
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# --- Konfigurasi ---
SITEMAP_INDEX_URL = "https://jagatplay.com/sitemap.html"
URL_OUTPUT_FILE = "../data/crawled_urls.txt"
LOG_FILE = "../data/crawl_logs.txt"
PORTAL_NAME = "Jagatplay"
MAX_LINKS = 500  # Jumlah maksimal link yang ingin diambil

output_dir = os.path.dirname(URL_OUTPUT_FILE)
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def log_to_file(message):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}\n"
    print(log_entry.strip())
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(log_entry)

def get_jagatplay_sitemap_number(url):
    """Mengekstrak nomor dari URL sitemap untuk sorting yang benar."""
    match = re.search(r'post-sitemap(\d+)\.html', url)
    return int(match.group(1)) if match else 0

def crawl_jagatplay(max_links=None):
    """
    Crawl artikel dari Jagatplay.com
    
    Args:
        max_links (int, optional): Jumlah maksimal link yang ingin diambil.
                                   Jika None, akan mengambil semua link.
    """
    log_to_file(f"===== Memulai Proses Crawling {PORTAL_NAME} =====")
    if max_links:
        log_to_file(f"Target: Mengambil maksimal {max_links} link")
    
    all_urls_found = []
    
    try:
        log_to_file(f"Mengambil sitemap index dari: {SITEMAP_INDEX_URL}")
        response_main = requests.get(SITEMAP_INDEX_URL, headers=HEADERS, timeout=15, verify=False)
        response_main.raise_for_status()
        soup_main = BeautifulSoup(response_main.content, "html.parser")
        
        sitemap_tags = soup_main.select('tr > td > a')
        all_sitemap_links = [tag['href'] for tag in sitemap_tags if tag.has_attr('href')]
        
        if not all_sitemap_links:
            log_to_file("ERROR: Tidak ada link sitemap yang ditemukan di halaman index.")
            return

        log_to_file(f"Total sitemap ditemukan di index: {len(all_sitemap_links)}")
        
        post_sitemap_links_numbered = [url for url in all_sitemap_links if re.search(r'post-sitemap\d+\.html', url)]
        
        post_sitemap_links_numbered.sort(key=get_jagatplay_sitemap_number)
        
        sitemap_limit = 11
        sitemaps_to_process = [
            url for url in post_sitemap_links_numbered 
            if get_jagatplay_sitemap_number(url) <= sitemap_limit
        ]
        
        log_to_file(f"Menyaring sitemap... Akan memproses {len(sitemaps_to_process)} sitemap (hingga post-sitemap{sitemap_limit}).")

        for i, sitemap_url in enumerate(sitemaps_to_process, 1):
            # Cek apakah sudah mencapai limit
            if max_links and len(all_urls_found) >= max_links:
                log_to_file(f"Sudah mencapai limit {max_links} link. Menghentikan crawling.")
                break
                
            log_to_file(f"({i}/{len(sitemaps_to_process)}) Memproses sitemap: {sitemap_url}")
            try:
                response_post = requests.get(sitemap_url, headers=HEADERS, timeout=15, verify=False)
                response_post.raise_for_status()

                # Gunakan lxml-xml dengan fallback ke html.parser
                try:
                    soup_post = BeautifulSoup(response_post.content, "lxml-xml")
                except:
                    soup_post = BeautifulSoup(response_post.content, "html.parser")
                
                article_links = []
                url_blocks = soup_post.find_all('url')
                for block in url_blocks:
                    loc_tag = block.find('loc')
                    if loc_tag:
                        article_links.append(loc_tag.text)
                
                # Batasi jumlah URL yang diambil jika sudah mendekati limit
                if max_links:
                    remaining_links = max_links - len(all_urls_found)
                    if remaining_links < len(article_links):
                        article_links = article_links[:remaining_links]
                        log_to_file(f"  -> Membatasi ke {remaining_links} link untuk mencapai target {max_links}")
                
                log_to_file(f"  -> Ditemukan {len(article_links)} link artikel.")
                all_urls_found.extend(article_links)
                
                # Cek apakah sudah mencapai limit setelah menambah URL
                if max_links and len(all_urls_found) >= max_links:
                    log_to_file(f"  -> Target {max_links} link tercapai!")
                    break

            except requests.exceptions.RequestException as e:
                log_to_file(f"  -> ERROR: Gagal mengambil atau memproses {sitemap_url}: {e}")
                continue

    except requests.exceptions.RequestException as e:
        log_to_file(f"FATAL ERROR: Gagal mengambil sitemap index utama. Proses dihentikan. Error: {e}")
        return

    if all_urls_found:
        unique_urls = sorted(list(set(all_urls_found)))
        log_to_file(f"Menyimpan total {len(unique_urls)} link unik ke file {URL_OUTPUT_FILE}...")
        try:
            with open(URL_OUTPUT_FILE, 'a', encoding='utf-8') as f:
                for url in unique_urls:
                    f.write(f"{PORTAL_NAME};{url}\n")
            log_to_file("Penyimpanan link berhasil.")
        except IOError as e:
            log_to_file(f"ERROR: Gagal menulis ke file {URL_OUTPUT_FILE}: {e}")
    else:
        log_to_file("Tidak ada link baru yang ditemukan pada sesi crawling ini.")
    
    log_to_file(f"Total link yang didapat pada sesi ini: {len(all_urls_found)}")
    log_to_file(f"Total link unik yang disimpan: {len(unique_urls) if all_urls_found else 0}")
    log_to_file("===== Proses Crawling Selesai =====\n")


if __name__ == "__main__":
    # Panggil dengan parameter max_links
    # Ubah nilai MAX_LINKS sesuai kebutuhan, atau set None untuk mengambil semua
    crawl_jagatplay(max_links=MAX_LINKS)

## **Scraping**

In [None]:
import requests
from bs4 import BeautifulSoup, NavigableString, Comment
from datetime import datetime
import pandas as pd
import os
import time
import re
import urllib3

# Nonaktifkan pesan peringatan SSL
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# --- Konfigurasi ---
CRAWLED_URL_FILE = "../data/crawled_urls.txt"
OUTPUT_CSV_FILE = "../data/scraped_articles.csv"
LOG_FILE = "../data/scrape_logs.txt"

PORTALS_TO_SCRAPE = [
    "Gamebrott",
    "Kotakgame",
    "Indogamers",
    "Jagatplay"
]

os.makedirs(os.path.dirname(OUTPUT_CSV_FILE), exist_ok=True)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def log_to_file(message):
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}\n"
    print(log_entry.strip())
    with open(LOG_FILE, 'a', encoding='utf-8') as f:
        f.write(log_entry)

def read_all_urls_to_scrape(filepath, portal_names):
    log_to_file(f"Membaca SEMUA URL dari {filepath}...")
    all_tasks = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line in f:
                if ';' in line:
                    portal, url = line.strip().split(';', 1)
                    if portal in portal_names:
                        all_tasks.append((portal, url))
        log_to_file(f"Total URL target ditemukan: {len(all_tasks)}")
        return all_tasks
    except FileNotFoundError:
        log_to_file(f"ERROR: File {filepath} tidak ditemukan.")
        return []

def get_already_scraped_urls(filepath):
    """Membaca CSV dan mengembalikan set URL yang sudah di-scrape."""
    if not os.path.exists(filepath):
        return set()
    try:
        df = pd.read_csv(filepath)
        if 'url' in df.columns:
            return set(df['url'])
        else:
            log_to_file("WARNING: Kolom 'url' tidak ditemukan di CSV. Tidak bisa melanjutkan. Harap hapus file CSV lama atau tambahkan kolom 'url'.")
            return set()
    except pd.errors.EmptyDataError:
        return set()
    except Exception as e:
        log_to_file(f"Error saat membaca CSV yang ada: {e}. Mengasumsikan file kosong.")
        return set()

def extract_text_with_links(element):
    """Ekstrak teks dari elemen termasuk tag <a>, lalu bersihkan HTML belakangan."""
    if not element:
        return 'N/A'
    
    # Ambil semua paragraph
    paragraphs = element.find_all('p')
    content_parts = []
    
    for p in paragraphs:
        # Ambil teks dengan mempertahankan <a> tag sementara
        text = p.get_text(separator=' ', strip=True)
        if text:
            content_parts.append(text)
    
    return '\n'.join(content_parts) if content_parts else 'N/A'

def scrape_kotakgame_article(url, soup):
    """Scrape artikel dari Kotakgame"""
    title = 'N/A'
    thumbnail_url = 'N/A'
    publish_date = 'N/A'
    content = 'N/A'
    
    # Judul - ada di class bagiankiri > h3.judulh3
    bagian_kiri = soup.select_one('.bagiankiri')
    if bagian_kiri:
        title_tag = bagian_kiri.select_one('h3.judulh3')
        if title_tag:
            title = title_tag.get_text(strip=True)
    
    # Thumbnail - ada di wrapimg > img src
    thumb_tag = soup.select_one('.wrapimg img')
    if thumb_tag and thumb_tag.has_attr('src'):
        relative_url = thumb_tag['src']
        thumbnail_url = f"https://www.kotakgame.com{relative_url}" if relative_url.startswith('/') else relative_url
    
    # Tanggal - ada di boxwidget > boxcreate > span.txtcreate2
    date_span = soup.select_one('.boxwidget .boxcreate .txtcreate2')
    if date_span:
        publish_date = date_span.get_text(strip=True)
    
    # Konten - ada di isinewsp, ambil semua <p>
    content_div = soup.select_one('.isinewsp')
    if content_div:
        content = extract_text_with_links(content_div)
    
    return {
        "judul": title,
        "konten": content,
        "tanggal_terbit": publish_date,
        "url_thumbnail": thumbnail_url
    }

def scrape_indogamers_article(url, soup):
    title_tag = soup.select_one('h1[class*="style_article__title__"]')
    title = title_tag.get_text(strip=True) if title_tag else 'N/A'
    thumb_tag = soup.select_one('div[class*="style_image__article__"] img')
    if thumb_tag and thumb_tag.has_attr('srcset'):
        last_url_part = thumb_tag['srcset'].split(',')[-1].strip()
        relative_url = last_url_part.split(' ')[0]
        thumbnail_url = f"https://indogamers.com{relative_url}"
    else: thumbnail_url = 'N/A'
    date_container = soup.select_one('div[class*="style_author__box__"]')
    publish_date = 'N/A'
    if date_container:
        all_spans = date_container.find_all('span')
        for span in all_spans:
            span_text = span.get_text(strip=True)
            if re.search(r'\b(Senin|Selasa|Rabu|Kamis|Jumat|Sabtu|Minggu)\b', span_text):
                publish_date = span_text.split(',')[0].strip()
                break
    # content_div = soup.find('article[class*=style_content__article__]')
    content_div = soup.find('article', class_=re.compile(r'style_content__article___'))
    content = content_div.decode_contents() if content_div else 'N/A'
    # content = 'N/A'
    # if content_div:
    #     paragraphs = content_div.find_all('p')
    #     content = '\n'.join([p.get_text(strip=True) for p in paragraphs if not p.has_attr('class') or 'caption' not in ''.join(p['class'])])
    return {"judul": title, "konten": content, "tanggal_terbit": publish_date, "url_thumbnail": thumbnail_url}

def scrape_gamebrott_article(url, soup):
    """Scrape artikel dari Gamebrott"""
    title = 'N/A'
    thumbnail_url = 'N/A'
    publish_date = 'N/A'
    content = 'N/A'
    
    # Judul - class post-wrapper > h1.jeg_post_title
    post_wrapper = soup.select_one('.post-wrapper')
    if post_wrapper:
        title_tag = post_wrapper.select_one('h1.jeg_post_title')
        if title_tag:
            title = title_tag.get_text(strip=True)
    
    # Thumbnail - class jeg_featured.featured_image > thumbnail-container > img src
    thumb_tag = soup.select_one('.jeg_featured.featured_image .thumbnail-container img')
    if thumb_tag and thumb_tag.has_attr('src'):
        thumbnail_url = thumb_tag['src']
    
    # Tanggal - div.jeg_meta_container > div.jeg_meta_date > a
    date_tag = soup.select_one('.jeg_meta_container .jeg_meta_date a')
    if date_tag:
        publish_date = date_tag.get_text(strip=True)
    
    # Konten - div.entry-content.no-share > div.content-inner.jeg_link_underline > p
    content_div = soup.select_one('.entry-content.no-share .content-inner.jeg_link_underline')
    if content_div:
        content = extract_text_with_links(content_div)
    
    return {
        "judul": title,
        "konten": content,
        "tanggal_terbit": publish_date,
        "url_thumbnail": thumbnail_url
    }

def scrape_jagatplay_article(url, soup):
    """Scrape artikel dari Jagatplay"""
    title = 'N/A'
    thumbnail_url = 'N/A'
    publish_date = 'N/A'
    content = 'N/A'
    
    # Cari div#mainContent dulu
    main_content = soup.select_one('div#mainContent')
    if not main_content:
        main_content = soup  # Fallback ke soup utama
    
    # Judul - div.jgpost__header > h1
    header = main_content.select_one('.jgpost__header')
    if header:
        title_tag = header.select_one('h1')
        if title_tag:
            title = title_tag.get_text(strip=True)
    
    # Thumbnail - class jgpost__feat-img, ambil background url dari style
    feat_img = main_content.select_one('.jgpost__feat-img')
    if feat_img and feat_img.has_attr('style'):
        style = feat_img['style']
        match = re.search(r"url\(['\"]?(.*?)['\"]?\)", style)
        if match:
            thumbnail_url = match.group(1)
    
    # Tanggal - div.jgpost__content > div.jgauthor.breakout > div.jgauthor__posted > div
    author_posted = main_content.select_one('.jgpost__content .jgauthor.breakout .jgauthor__posted')
    if author_posted:
        # Cari div yang berisi tanggal (biasanya div terakhir atau yang tidak punya tag <a>)
        divs = author_posted.find_all('div', recursive=False)
        for div in divs:
            if not div.find('a'):  # Div tanpa link biasanya berisi tanggal
                publish_date = div.get_text(strip=True)
                break
    
    # Konten - div.jgpost__content > p
    content_div = main_content.select_one('.jgpost__content')
    if content_div:
        content = extract_text_with_links(content_div)
    
    return {
        "judul": title,
        "konten": content,
        "tanggal_terbit": publish_date,
        "url_thumbnail": thumbnail_url
    }

def main():
    log_to_file("===== Memulai Proses Scraping Skala Penuh (Mode Resume + Real-time Save) =====")
    
    all_tasks = read_all_urls_to_scrape(CRAWLED_URL_FILE, PORTALS_TO_SCRAPE)
    
    header = ['id_dokumen', 'sumber', 'url', 'judul', 'konten', 'tanggal_terbit', 'url_thumbnail']
    
    already_scraped_urls = get_already_scraped_urls(OUTPUT_CSV_FILE)
    if already_scraped_urls:
        log_to_file(f"Ditemukan {len(already_scraped_urls)} URL yang sudah diproses. Akan melanjutkan.")
    
    doc_id_counter = len(already_scraped_urls) + 1
    
    if not os.path.exists(OUTPUT_CSV_FILE) or not already_scraped_urls:
        log_to_file(f"File {OUTPUT_CSV_FILE} tidak ada atau kosong. Membuat file baru dengan header.")
        pd.DataFrame(columns=header).to_csv(OUTPUT_CSV_FILE, index=False)
        doc_id_counter = 1
    
    total_urls_to_process = len(all_tasks)
    newly_scraped_count = 0
    
    for i, (portal, url) in enumerate(all_tasks):
        # Lewati URL yang sudah ada
        if url in already_scraped_urls:
            continue
            
        log_to_file(f"  ({i+1}/{total_urls_to_process}) Scraping: {url}")
        
        try:
            response = requests.get(url, headers=HEADERS, timeout=15, verify=False)
            if response.status_code != 200:
                log_to_file(f"    -> Gagal mengakses (Status: {response.status_code})")
                continue
            
            soup = BeautifulSoup(response.content, 'html.parser')

            data = None
            if portal == "Gamebrott":
                data = scrape_gamebrott_article(url, soup)
            elif portal == "Kotakgame":
                data = scrape_kotakgame_article(url, soup)
            elif portal == "Indogamers":
                data = scrape_indogamers_article(url, soup)
            elif portal == "Jagatplay":
                data = scrape_jagatplay_article(url, soup)
            
            if data:
                data['id_dokumen'] = f"doc_{doc_id_counter:05d}"
                data['sumber'] = portal
                data['url'] = url
                
                df_row = pd.DataFrame([data])
                df_row = df_row[header]
                
                df_row.to_csv(OUTPUT_CSV_FILE, mode='a', index=False, header=False)
                
                log_to_file(f"    -> Berhasil: {data['judul'][:50]}...")
                
                doc_id_counter += 1
                newly_scraped_count += 1

            time.sleep(0.5)

        except Exception as e:
            log_to_file(f"    -> Terjadi error kritis saat scraping {url}: {e}")

    log_to_file(f"Scraping selesai. Total {newly_scraped_count} artikel BARU berhasil disimpan ke {OUTPUT_CSV_FILE}.")
    log_to_file(f"Total artikel keseluruhan: {doc_id_counter - 1}")
    log_to_file(f"Proses selesai.\n")

if __name__ == "__main__":
    main()

## **Preprocessing**

### EDA

In [1]:
import pandas as pd

df_article = pd.read_csv('../data/scraped_articles.csv')
print("Preview Dataframe")
print(df_article.head())

print("Statistik Deskriptif Untuk Dataframe Articles")
print(df_article.describe())

print("Tipe Data Dataframe Articles")
print(df_article.dtypes)

Preview Dataframe
  id_dokumen     sumber                                                url  \
0  doc_00001  Gamebrott                        https://gamebrott.com/blog/   
1  doc_00002  Gamebrott  https://gamebrott.com/microsoft-investigasi-pe...   
2  doc_00003  Gamebrott  https://gamebrott.com/amd-klarifikasi-soal-soc...   
3  doc_00004  Gamebrott  https://gamebrott.com/detail-trailer-fallout-s...   
4  doc_00005  Gamebrott  https://gamebrott.com/kisah-clair-obscur-exped...   

                                               judul  \
0                                                NaN   
1  Microsoft Investigasi Penyebab Masalah SSD, Be...   
2  AMD Klarifikasi Masalah Socket AM5 Terbakar, S...   
3  Gamescom 2025¬†‚Äî Detail Trailer Fallout Season ...   
4  Kisah Clair Obscur Expedition 33, Mulai dari A...   

                                              konten   tanggal_terbit  \
0                                                NaN              NaN   
1  Beberapa hari lalu, perm

### Missing Values

In [None]:
print(df_article.isnull().sum())

df_null = df_article.dropna()
print(f"Jumlah baris sebelum dihandling: {df_article.shape[0]}")
print(f"Jumlah baris setelah dihandling: {df_null.shape[0]}")

df_null.isnull().sum()
df_article = df_null


### Duplicate

In [None]:
df_article.duplicated().sum()

### Date Standardize

In [None]:
unique_sources = df_article['sumber'].unique()
sample_rows = []

for source in unique_sources:
    sample = df_article[df_article['sumber'] == source].head(1)
    sample_rows.append(sample)

if sample_rows:
    temp_df = pd.concat(sample_rows)
    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.width', 1000)
    print(temp_df[['sumber', 'tanggal_terbit']])

In [None]:
import pandas as pd
import re

df_article 

# Dictionary bulan Inggris ke Indonesia
bulan_dict = {
    'January': 'Januari', 'February': 'Februari', 'March': 'Maret', 'April': 'April',
    'May': 'Mei', 'June': 'Juni', 'July': 'Juli', 'August': 'Agustus', 'September': 'September',
    'October': 'Oktober', 'November': 'November', 'December':'Desember',
    'Jan': 'Januari', 'Feb': 'Februari', 'Mar': 'Maret', 'Apr': 'April', 'May': 'Mei',
    'Jun': 'Juni', 'Jul': 'Juli', 'Aug': 'Agustus', 'Sep': 'September', 'Oct': 'Oktober',
    'Nov': 'November', 'Dec':'Desember','Agu':'Agustus','Okt':'Oktober','Des':'Desember'
}

# Pembersihan dan konversi
def convert_date(text):
    if not isinstance(text, str):
        return 'N/A'
    
    original_text = text.strip()

    if '|' in original_text:
        text = original_text.split('|', 1)[-1].strip()
    
    if re.search(r'(?i)\b\d+\s+(Hari|Jam)\s+yang\s+lalu\b', text):
        return '20 November 2025'
    
    # Buang hari & jam jika ada
    text = re.sub(r'(?i)\b(Minggu|Senin|Selasa|Rabu|Kamis|Jumat|Sabtu),?\s*', '', text)
    text = re.sub(r'\b(Minggu|Senin|Selasa|Rabu|Kamis|Jumat|Sabtu)\s*', '', text)
    text = re.sub(r'\d{2}:\d{2}$', '', text)
    
    # Ubah 'July 31, 2012' ke '31 July 2012'
    m = re.match(r'([A-Za-z]+) (\d{1,2}), (\d{4})', text.strip())
    if m:
        text = f"{m.group(2)} {m.group(1)} {m.group(3)}"
    
    # Ubah ke list lalu ganti bulan
    parts = text.strip().split()
    if len(parts) >= 3:
        hari, bulan_raw, tahun = parts[0], parts[1], parts[2]
    else:
        return text.strip()
    # Ganti bulan
    bulan = bulan_dict.get(bulan_raw, bulan_raw)
    # Format ulang
    return f"{hari} {bulan} {tahun}"

df_article['tanggal_terbit_normalized'] = df_article['tanggal_terbit'].apply(convert_date)

unique_sources = df_article['sumber'].unique()
sample_rows = []

for source in unique_sources:
    sample = df_article[df_article['sumber'] == source].head(1)
    sample_rows.append(sample)

if sample_rows:
    temp_df = pd.concat(sample_rows)
    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.width', 1000)
    print(temp_df[['sumber', 'tanggal_terbit', 'tanggal_terbit_normalized']])

In [None]:
import pandas as pd
df_article_cleaned = df_article.drop('tanggal_terbit', axis=1)
df_article_cleaned = df_article_cleaned.rename(columns={'tanggal_terbit_normalized': 'tanggal_terbit'})

# Kamus untuk mengubah tanggal_terbit dari format object menjadi datetime
ID_TO_EN_MAP = {
    'Januari': 'January',
    'Februari': 'February',
    'Maret': 'March',
    'April': 'April',
    'Mei': 'May',
    'Juni': 'June',
    'Juli': 'July',
    'Agustus': 'August',
    'September': 'September',
    'Oktober': 'October',
    'November': 'November',
    'Desember': 'December'
}

def convert_indo_date_to_datetime(date_str):
    """Mengubah string tanggal Indo (15 Agustus 2025) ke datetime object."""
    if not isinstance(date_str, str) or date_str == 'N/A':
        return pd.NaT
    
    # Terjemahkan nama bulan di dalam string ke Inggris
    # Contoh: "15 Agustus 2025" -> "15 August 2025"
    date_str_en = date_str
    for id_month, en_month in ID_TO_EN_MAP.items():
        if id_month in date_str:
            date_str_en = date_str.replace(id_month, en_month)
            break
            
    # Sekarang Pandas bisa membacanya dengan mudah
    try:
        return pd.to_datetime(date_str_en, format='%d %B %Y')
    except Exception:
        return pd.NaT

# 2. Terapkan ke kolom baru 'timestamp'
print("Sedang membuat kolom timestamp...")
df_article_cleaned['timestamp'] = df_article_cleaned['tanggal_terbit'].apply(convert_indo_date_to_datetime)

# 3. Cek Hasilnya
print("\nCek tipe data:")
print(df_article_cleaned.dtypes)

print("\nContoh data:")
# Tampilkan kolom tanggal (string indo) dan timestamp (datetime) berdampingan
display(df_article_cleaned[['tanggal_terbit', 'timestamp']].head())

# Opsional: Cek apakah masih ada NaT (selain yang memang N/A)
jumlah_nat = df_article_cleaned['timestamp'].isna().sum()
print(f"\nJumlah baris yang gagal dikonversi (NaT): {jumlah_nat}")
timestamp_nat = df_article_cleaned['timestamp'].isna()
failed_rows = df_article_cleaned[timestamp_nat]

if not failed_rows.empty:
    print(f"\nDitemukan {len(failed_rows)} baris yang gagal dikonversi menjadi datetime (NaT).")
    print("Berikut adalah sampel dari baris-baris yang gagal tersebut:")
    
    # 'tanggal_terbit' adalah kolom yang paling penting untuk dianalisis
    display(failed_rows[['sumber', 'tanggal_terbit', 'timestamp']])
    
    print("\n--- Analisis Frekuensi Format Tanggal yang Gagal ---")
    print("Berikut adalah format-format tanggal unik yang paling sering menyebabkan kegagalan:")
    
    display(failed_rows['tanggal_terbit'].value_counts().head(20))
    
else:
    print("\nSelamat! Tidak ada baris dengan nilai NaT di kolom 'timestamp'.")

print(df_article_cleaned.head(5))

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import re

TODAY = datetime(2025, 11, 10)

ID_TO_EN_MAP = {
    'Januari': 'January', 'Februari': 'February', 'Maret': 'March', 'April': 'April',
    'Mei': 'May', 'Juni': 'June', 'Juli': 'July', 'Agustus': 'August',
    'September': 'September', 'Oktober': 'October', 'November': 'November', 'Desember': 'December'
}

def convert_to_datetime(date_str):
    if not isinstance(date_str, str) or date_str.lower() == 'n/a':
        return pd.NaT
    
    match = re.search(r'(\d+)\s+Hari yang', date_str, re.IGNORECASE)
    if match:
        days_ago = int(match.group(1))
        calculated_date = TODAY - timedelta(days=days_ago)
        return calculated_date
    
    date_str_en = date_str
    for id_month, en_month in ID_TO_EN_MAP.items():
        if id_month in date_str:
            date_str_en = date_str.replace(id_month, en_month)
            break

    try:
        return pd.to_datetime(date_str_en, format='%d %B %Y')
    except Exception:
        # Jika gagal, kembalikan NaT
        print(f"Gagal mem-parsing tanggal absolut: '{date_str}'")
        return pd.NaT

print("--- DataFrame SEBELUM konversi tanggal relatif ---")
df_article_cleaned.head()
print("-" * 50)

# Terapkan fungsi konversi baru untuk membuat kolom 'timestamp'
print("Mengonversi semua format tanggal ke tipe datetime...")
df_article_cleaned['timestamp'] = df_article_cleaned['tanggal_terbit'].apply(convert_to_datetime)

def format_to_indonesian_str(dt_object):
    if pd.isna(dt_object):
        return 'N/A'
    english_date_str = dt_object.strftime('%d %B %Y')
    for en_month, id_month in ID_TO_EN_MAP.items():
        english_date_str = english_date_str.replace(en_month, id_month)
    return english_date_str

# Timpa kolom 'tanggal_terbit' yang lama dengan format yang sudah konsisten
df_article_cleaned['tanggal_terbit'] = df_article_cleaned['timestamp'].apply(format_to_indonesian_str)

print("Konversi selesai.")
print("\n--- DataFrame SETELAH konversi ---")
df_article_cleaned.head()
print("\nTipe data akhir:")
df_article_cleaned.info()

In [None]:
df_article = df_article_cleaned
df_article.isna().sum()

### Content Preprocessing

#### Create Whitelist

In [None]:
import pandas as pd
import re
import os
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Daftar kata kerja/umum yang HARUS DIKELUARKAN dari WHITELIST agar bisa di-stemming
WORDS_TO_EXCLUDE_FROM_WHITELIST = set([
    "bawa", "beli", "bepergian", "beredar", "berencana", "berhasil", "berkurang", "bermain", "beruntunnya", 
    "diam", "dibatalkan", "dicap", "diduga", "digelar", "dihapus", "dihentikan", "dikembangkan", "dikenalkan", 
    "dilengkapi", "dirilis", "dirumorkan", "disebar", "diskon", "ditangkap", "ditolak", "ditunda", "diumumkan", 
    "diungkap", "habiskan", "hadir", "hadirkan", "kecanduan", "kecipratan", "kejar", "kelarkan", "kesalahan", 
    "ketahuan", "ketahui", "keuntungan", "melengkapi", "meluncur", "memainkannya", "memanjat", "memilih",
    "menanggapinya", "menawarkan", "mencapai", "mencekam", "mengalami", "mengambil", "mengatasi", "mengerti", 
    "menghabiskan", "menghilang", "meningkat", "menjaga", "merawat", "merusak", "muncul", "paham", "pakai", 
    "pamer", "pamerkan", "perbincangan", "percaya", "perdana", "performa", "perilisan", "perlihatkan", 
    "perusahaan", "terbakar", "terbang", "tercepat", "terhentikan", "terinspirasi", "terjual", "terlaris", 
    "tetapkan", "tambahkan", "tanggapi", "umumkan",
    
    # Tambahan lain yang bersifat umum (dari contoh sebelumnya)
    "cara", "tips", "trik", "terbaru", "akan", "bikin", "terbaik", "terlengkap", "wajib", "dibuka", "dimainkan", 
    "gratis", "ketagihan", "seru", "lupa", "coba", "dicoba", "keren", "mainkan", "banget", "lengkap",
    "pembayaran", "pemecatan", "pemerintahan", "pengalaman", "tanggal", "director", "nggak", "orang", "dirumorkan",
    "ps", "dirilis"
])


def clean_for_ngram(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', ' ', text)
        return text
    return ""

df_article['judul_clean'] = df_article['judul'].apply(clean_for_ngram)

# Menggunakan Stopwords Gabungan (Indonesia dan Inggris)
STOPWORDS_COMBINED = set(stopwords.words('indonesian')) | set(stopwords.words('english'))

def get_top_ngrams(corpus, n=2, top_k=500):
    """Fungsi untuk mendapatkan N-gram yang paling sering muncul."""
    
    # Inisialisasi CountVectorizer
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words=list(STOPWORDS_COMBINED))
    
    X = vectorizer.fit_transform(corpus)
    
    word_counts = X.sum(axis=0)
    word_freq = [(word, word_counts[0, idx]) 
                 for word, idx in vectorizer.vocabulary_.items()]
    
    word_freq.sort(key=lambda x: x[1], reverse=True)
    return word_freq[:top_k]

# Ekstrak N-gram
top_bigrams = get_top_ngrams(df_article['judul_clean'], n=2, top_k=500)
top_trigrams = get_top_ngrams(df_article['judul_clean'], n=3, top_k=500)

print("### Top 20 Bigram ###")
for phrase, count in top_bigrams[:20]:
    print(f"'{phrase}' (Count: {count})")

print("\n### Top 20 Trigram ###")
for phrase, count in top_trigrams[:20]:
    print(f"'{phrase}' (Count: {count})")

# 1. Kumpulkan semua kata kandidat dari N-gram
potential_whitelist_words = set()

for phrase, count in top_bigrams:
    for word in phrase.split():
        potential_whitelist_words.add(word)

for phrase, count in top_trigrams:
    for word in phrase.split():
        potential_whitelist_words.add(word)
        
# 2. Tambahkan Jargon/Akronim Hardcoded (sebelum filtering)
potential_whitelist_words.update([
    "the", "of", "and", "or", "in", "with", 
    "buff", "nerf", "meta", "patch", "dlc", "rts", "fps", "moba", "rpg",
    "pc", "ps5", "xbox", "nintendo", "steam", "mobile", "e3" # Penting untuk akronim/entitas
])

print(f"\nTotal kata unik (token) sebelum filtering: {len(potential_whitelist_words)}")

# 3. FILTERING FINAL: Hanya kata yang TIDAK termasuk WORDS_TO_ALLOW_STEMMING
final_whitelist_words = [
    word for word in potential_whitelist_words if word not in WORDS_TO_EXCLUDE_FROM_WHITELIST
]

# Nama file yang akan digunakan untuk menyimpan whitelist
FILE_PATH = '../data/whitelist.txt'

with open(FILE_PATH, 'w', encoding='utf-8') as f:
    for word in final_whitelist_words:
        f.write(word + '\n')

print(f"\n‚úÖ Whitelist bersih (total {len(final_whitelist_words)} kata) berhasil disimpan ke '{FILE_PATH}'.")

#### Create Stopwords List

In [None]:
import pandas as pd
from collections import Counter
import re
import os 
import nltk
from nltk.corpus import stopwords 

# =======================================================
# 1. FUNGSI DAN MUAT KOMPONEN
# =======================================================

# üö® Penambahan try-except untuk download NLTK stopwords
try:
    stopwords.words('indonesian')
except LookupError:
    print("‚è≥ NLTK stopwords Bahasa Indonesia belum diunduh. Sedang mengunduh...")
    nltk.download('stopwords')
    print("‚úÖ Pengunduhan NLTK stopwords selesai.")

def load_words_set(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f if line.strip())
    except FileNotFoundError:
        print(f"‚ö†Ô∏è Peringatan: File {file_path} tidak ditemukan. Menggunakan set kosong.")
        return set()

# Asumsi lokasi file Whitelist
WHITELIST_FILE_PATH = '../data/whitelist.txt'
WHITELIST = load_words_set(WHITELIST_FILE_PATH)

# Menggunakan Stoplist Bahasa Indonesia dari NLTK
NLTK_ID_STOPWORDS = set(stopwords.words('indonesian'))
STANDARD_STOPWORDS = NLTK_ID_STOPWORDS 

# --- Kata-kata yang TIDAK BOLEH ADA di Final Stoplist ---
WORDS_TO_ALLOW_STEMMING = set([
    "bawa", "beli", "bepergian", "beredar", "berencana", "berhasil", "berkurang", "bermain", "beruntunnya", 
    "diam", "dibatalkan", "dicap", "diduga", "digelar", "dihapus", "dihentikan", "dikembangkan", "dikenalkan", 
    "dilengkapi", "dirilis", "dirumorkan", "disebar", "diskon", "ditangkap", "ditolak", "ditunda", "diumumkan", 
    "diungkap", "habiskan", "hadir", "hadirkan", "kecanduan", "kecipratan", "kejar", "kelarkan", "kesalahan", 
    "ketahuan", "ketahui", "keuntungan", "melengkapi", "meluncur", "memainkannya", "memanjat", "memilih",
    "menanggapinya", "menawarkan", "mencapai", "mencekam", "mengalami", "mengambil", "mengatasi", "mengerti", 
    "menghabiskan", "menghilang", "meningkat", "menjaga", "merawat", "merusak", "muncul", "paham", "pakai", 
    "pamer", "pamerkan", "perbincangan", "percaya", "perdana", "performa", "perilisan", "perlihatkan", 
    "perusahaan", "terbakar", "terbang", "tercepat", "terhentikan", "terinspirasi", "terjual", "terlaris", 
    "tetapkan", "tambahkan", "tanggapi", "umumkan"
])


CURATED_ADDITIONAL_STOPWORDS = set([
    "baca", "juga", "berita", "lain", "kembali", "lanjut", 
    "sob", "gaes", "yuk", "dunia", "maya", "segera", "download", 
    "jangan", "lupa", "contoh", "terkait", "halaman", "akhir", "artikel"
])

# =======================================================
# 2. PEMBENTUKAN FINAL STOPLIST
# =======================================================

# 1. Gabungkan semua kata yang ingin dihapus (STANDAR NLTK ID + CURATED NOISE)
all_stopwords_to_remove = STANDARD_STOPWORDS | CURATED_ADDITIONAL_STOPWORDS

# 2. Kurangi dengan WHITELIST (untuk melindungi entitas/jargon)
FINAL_STOPLIST = all_stopwords_to_remove - WHITELIST

# 3. KOREKSI TAMBAHAN: Pastikan semua kata kerja yang diizinkan untuk stemming TIDAK masuk ke stoplist
FINAL_STOPLIST = FINAL_STOPLIST - WORDS_TO_ALLOW_STEMMING

print(f"\nTotal Stopwords Kandidat (NLTK ID + Tambahan): {len(all_stopwords_to_remove)}")
print(f"Total Kata Kunci di Whitelist: {len(WHITELIST)}")
print(f"Total Kata Kerja yang Diizinkan Stemming: {len(WORDS_TO_ALLOW_STEMMING)}")
print(f"‚úÖ Total kata di FINAL STOPLIST: {len(FINAL_STOPLIST)}")
print("-" * 30)

# --- PENYIMPANAN FINAL STOPLIST KE FILE TEKS ---
FINAL_STOPLIST_FILE_PATH = '../data/final_stopwords.txt'

if not os.path.exists(os.path.dirname(FINAL_STOPLIST_FILE_PATH)):
    os.makedirs(os.path.dirname(FINAL_STOPLIST_FILE_PATH))

with open(FINAL_STOPLIST_FILE_PATH, 'w', encoding='utf-8') as f:
    for word in sorted(list(FINAL_STOPLIST)): 
        f.write(word + '\n')

print(f"‚úÖ FINAL STOPLIST berhasil disimpan ke '{FINAL_STOPLIST_FILE_PATH}'.")

#### Text Preprocessing

In [None]:
import pandas as pd
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm.auto import tqdm
import numpy as np

df_article

WHITELIST_FILE_PATH = "../data/whitelist.txt"
FINAL_STOPLIST_PATH = "../data/final_stopwords.txt"
CHECKPOINT_FILE_PATH = "../data/preprocessing_checkpoint.txt"
OUTPUT_CSV_FILE = "../data/processed_data.csv"

stemmer = StemmerFactory().create_stemmer()

def load_file(file_path):
    """Memuat set kata dari file teks."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f if line.strip())
    except FileNotFoundError:
        print(f" Peringatan: File {file_path} tidak ditemukan. Menggunakan set kosong.")
        return set()
    
WHITELIST = load_words_set(WHITELIST_FILE_PATH)
FINAL_STOPLIST = load_words_set(FINAL_STOPLIST_FILE_PATH)

def preprocess_text(text):
    text = re.sub(r'<[^>]*>', ' ', text)
    text = text.lower()
    text = text.replace("-", "")
    text = re.sub(r'[^\w\s]',' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = text.split()

    tokens_filtered = [word for word in tokens if word not in FINAL_STOPLIST]

    stemmed_tokens = []
    for word in tokens_filtered:
        if any(char.isalpha() for char in word) and any(char.isdigit() for char in word):
            stemmed_tokens.append(word)
            continue
            
        if word.isdigit():
            stemmed_tokens.append(word)
            continue
            
        if word in WHITELIST:
            stemmed_tokens.append(word)
            continue # Tambahkan continue agar tidak jatuh ke stemming di 'else'
        
        stemmed_tokens.append(stemmer.stem(word))
    
    return ' '.join(stemmed_tokens)

if 'konten_processed' not in df_article.columns:
    df_article['konten_processed'] = np.nan

total_rows = len(df_article)

start_index = 0
if os.path.exists(CHECKPOINT_FILE_PATH):
    with open(CHECKPOINT_FILE_PATH, 'r') as f:
        try:
            start_index = int(f.read().strip())
        except ValueError:
            start_index = 0

if start_index >= total_rows:
    print("Preprocessing selesai")
else:
    print(f"\n--- Memulai Proses Preprocessing Kolom 'konten_separated' ---")
    print(f"Melanjutkan dari baris ke-{start_index} dari {total_rows}...")

    for i in tqdm(range(start_index, total_rows),
                  initial=start_index,
                  total=total_rows,
                  desc="Preprocessing Konten"):
    
        if pd.isna(df_article.iloc[i]['konten_processed']):
            text_to_process = df_article.iloc[i]['konten']
            processed_text = preprocess_text(text_to_process)
            df_article.iloc[i, df_article.columns.get_loc('konten_processed')] = processed_text

            if (i + 1) % 1000 == 0 or i == total_rows - 1:
                with open(CHECKPOINT_FILE_PATH, 'w') as f:
                    f.write(str(i + 1))
    
    print("SELESAI")

if os.path.exists(CHECKPOINT_FILE_PATH) and df_article['konten_processed'].notna().all():
    os.remove(CHECKPOINT_FILE_PATH)
    print("File checkpoint dihapus.")

if 'konten' in df_article.columns and len(df_article) > 0:
    print("\n--- Contoh Perbandingan Hasil ---")
    
    for i in range(min(5, total_rows)):
        print(f"\nBaris ke-{i}:")
        print(f"Konten Asli: {str(df_article.iloc[i]['konten'])[:100]}...")
        print(f"Konten Diproses: {df_article.iloc[i]['konten_processed']}")
else:
    print("DataFrame sampel kosong atau kolom 'konten' tidak ada.")

# Ekstrak DataFrame yang sudah diproses ke CSV baru
df_article.to_csv(OUTPUT_CSV_FILE, index=False)
print(f"\nDataFrame telah diekstrak ke '{OUTPUT_CSV_FILE}'.")


## **Indexing and Modeling**

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

MODEL_DIR = "../models/"
os.makedirs(MODEL_DIR, exist_ok=True)

BM25_MODEL_PATH = os.path.join(MODEL_DIR, 'bm25_model.pkl')
CORPUS_DF_PATH = os.path.join(MODEL_DIR, 'df_corpus.pkl')

SBERT_MODEL_PATH = os.path.join(MODEL_DIR, 'sbert_model.pkl')
SBERT_EMBEDDINGS_PATH = os.path.join(MODEL_DIR, 'sbert_embeddings.npy')

df = pd.read_csv('../data/processed_data.csv')
df = df.drop_duplicates(subset=['konten_processed'], keep='first')

# BM25

# Tokenizing corpus
tokenized_corpus = df['konten_processed'].apply(lambda x: str(x).split()).tolist()

# Initialize BM25
bm25 = BM25Okapi(tokenized_corpus)

print("BM25 Indexing Selesai.")
print(f"Total dokumen terindeks oleh BM25: {len(tokenized_corpus)}")

print("\nEXPORTING BM25 MODEL")
try:
    with open(BM25_MODEL_PATH, "wb") as f:
        pickle.dump(bm25, f)
    print(f"Model BM25 berhasil disimpan ke: {BM25_MODEL_PATH}")
except Exception as e:
    print(f"ERROR saat menyimpan BM25: {e}")

try:
    df.to_pickle(CORPUS_DF_PATH)
    print(f"DataFrame Corpus berhasil disimpan ke: {CORPUS_DF_PATH}")
except Exception as e:
    print(f"ERROR saat menyimpan DataFrame: {e}")


# SBERT
model_name = 'paraphrase-multilingual-mpnet-base-v2'
sbert_model = SentenceTransformer(model_name)

corpus_texts = df['konten'].astype(str).tolist()

corpus_embeddings = sbert_model.encode(
    corpus_texts,
    show_progress_bar=True,
    convert_to_tensor=True
)

corpus_embeddings = corpus_embeddings.cpu().numpy()
print("S-BERT Indexing (Embeddings) Selesai.")
print("\nEXPORTING SBERT MODEL")

try:
    np.save(SBERT_EMBEDDINGS_PATH, corpus_embeddings)
    print(f"Corpus Embeddings berhasil disimpan ke: {SBERT_EMBEDDINGS_PATH}")
except Exception as e:
    print(f"ERROR saat menyimpan Embeddings: {e}")

try:
    with open(SBERT_MODEL_PATH, 'wb') as f:
        pickle.dump(model_name, f)
    print(f"Nama Model SBERT berhasil disimpan ke: {SBERT_MODEL_PATH}")
except Exception as e:
    print(f"ERROR saat menyimpan nama model: {e}")

## **Evaluation**

##### Step 1: Data Cleaning and Sampling (100 Random Samples)

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# === Load dataset mentah ===
print("Loading dataset...")
df = pd.read_csv('../data/scraped_articles.csv')

print(f"Sebelum cleaning: {len(df)} baris")

# === Pastikan id_dokumen tidak hilang ===
if 'id_dokumen' not in df.columns:
    raise ValueError("ERROR: Dataset tidak punya kolom 'id_dokumen'. Harus ada untuk IR!")

# === Buang baris tanpa konten ===
df = df[df['konten'].notna() & (df['konten'].str.strip() != '')]
df = df[df['konten'].apply(lambda x: len(str(x).strip()) > 50)]

print(f"Setelah drop missing/konten pendek: {len(df)} baris")

# === Buang duplikat berdasarkan URL jika ada, kalau tidak berdasarkan konten ===
if 'url' in df.columns:
    df.drop_duplicates(subset=['url'], keep='first', inplace=True)
else:
    df.drop_duplicates(subset=['konten'], keep='first', inplace=True)

print(f"Setelah drop duplikat: {len(df)} baris")

# === Fungsi bersihkan HTML secara agresif ===
def clean_html(text):
    if not isinstance(text, str):
        return ""
    # Parsing normal
    soup = BeautifulSoup(text, "html.parser")
    # Hapus script & style
    for tag in soup(["script", "style"]):
        tag.decompose()
    # Ambil teks
    text = soup.get_text(separator=" ")
    # Hapus whitespace berlebihan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print("Membersihkan HTML...")
df['konten_clean'] = df['konten'].apply(clean_html)
df['judul_clean'] = df['judul'].apply(clean_html) if 'judul' in df.columns else ""

# === Buang judul kosong jika diinginkan (opsional) ===
df = df[df['judul_clean'].str.strip() != ""]

# === Simpan hanya kolom penting & ID lama ===
final_df = df[['id_dokumen', 'judul_clean', 'konten_clean']]

# === Simpan ===
output_path_csv = '../evaluation/df_corpus_clean.csv'
final_df.to_csv(output_path_csv, index=False, encoding='utf-8')

print("\n=== SELESAI CLEANING! ===")
print("Dataset disimpan:", output_path_csv)
print("Jumlah dokumen final:", len(final_df))
print(final_df.head())


Loading dataset...
Sebelum cleaning: 2029 baris
Setelah drop missing/konten pendek: 2020 baris
Setelah drop duplikat: 1822 baris
Membersihkan HTML...

=== SELESAI CLEANING! ===
Dataset disimpan: ../evaluation/df_corpus_clean.csv
Jumlah dokumen final: 1822
  id_dokumen                                        judul_clean  \
1  doc_00002  Microsoft Investigasi Penyebab Masalah SSD, Be...   
2  doc_00003  AMD Klarifikasi Masalah Socket AM5 Terbakar, S...   
3  doc_00004  Gamescom 2025 ‚Äî Detail Trailer Fallout Season ...   
4  doc_00005  Kisah Clair Obscur Expedition 33, Mulai dari A...   
5  doc_00006  Developer Larang Konten Leak Infinity Nikki Di...   

                                        konten_clean  
1  Beberapa hari lalu, permasalahan SSD rusak men...  
2  AMD akhirnya buka suara mengenai socket proses...  
3  Detail Trailer Fallout Season 2 ‚Äì Trailer TV S...  
4  Kisah Clair Obscur: Expedition 33 ‚Äî Game perda...  
5  Leak Infinity Nikki ‚Äì Pada bulan Mei 2025 lalu...  


In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv("../evaluation/df_corpus_clean.csv")

def sample_random_subset(df, n=50, random_state=42):
    np.random.seed(random_state)
    sample_df = df.sample(n=n)
    sample_df = sample_df[['id_dokumen', 'judul_clean', 'konten_clean']]
    return sample_df

sample_berita = sample_random_subset(df, n=100)
sample_berita.to_csv('../evaluation/sample_berita_100.csv', index=False)
sample_berita.head(10)

Unnamed: 0,id_dokumen,judul_clean,konten_clean
555,doc_00557,Film Street Fighter Umumkan Cast dan Tanggal T...,Saatnya masuk ke arena pertarungan! Legendary ...
1741,doc_01751,Preview Hitman ‚Äì Absolution: Kesan Pertama yan...,Setelah sempat terlupakan eksistensinya dari i...
297,doc_00497,Aplikasi Ini Bantu Migrasi Data dari Windows 1...,"Meski sudah mulai banyak digunakan, nyatanya t..."
733,doc_00735,BNPT Sebut Kelompok Radikal Rekrut Anak Muda L...,Perilaku kejahatan dan terorisme dapat terjadi...
910,doc_00920,Rekomendasi Spesifikasi Smartphone Untuk Delta...,Indogamers.com - Delta Force Mobile merupakan ...
629,doc_00631,"Marvel Tokon Akan Hadir di TGS 2025, Akan Pame...","Sejak pertama kali diumumkan, Marvel Tokon: Fi..."
1034,doc_01044,"Panduan Jalur Top Lane di Honor of Kings, Stra...","Indogamers.com - Top Lane, atau sering disebut..."
609,doc_00611,Rockstar Jamin Perilisan GTA VI Akan Jadi Peri...,Antusiasme terhadap Grand Theft Auto VI (GTA 6...
678,doc_00680,[TGS 2025] Interview Ry≈çsuke Horii Director Ya...,Yakuza Kiwami 3 jadi game yang dinanti banyak ...
602,doc_00604,Petualangan Klasik Kembali! DRAGON QUEST VII: ...,"Penggemar serial DRAGON QUEST di Indonesia, be..."


##### Step 2: Generate Query using LLM

In [3]:
import os
import google.generativeai as genai
import pandas as pd
import json
import time
from dotenv import load_dotenv

# Load API Key
load_dotenv(override=True)
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

model = genai.GenerativeModel('gemini-2.0-flash-lite')

generation_config = genai.types.GenerationConfig(
    temperature=0.7,
    max_output_tokens=2048, 
)

safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
]

# Baca CSV
try:
    df = pd.read_csv('../evaluation/sample_berita_100.csv')
except:
    print("File CSV tidak ada, membuat dummy data...")
    df = pd.DataFrame({'id_dokumen':['d1'], 'judul_clean':['Tes'], 'konten_clean':['Isi tes']})

queries_json = []
qid_counter = 1

print("Mulai generate queries (Mode: High Token & Retry)...")

for idx, row in df.iterrows():
    judul = row.get('judul_clean', '')
    konten = row.get('konten_clean', '')
    
    if not judul or not konten:
        continue

    prompt_text = (
        f"Context: Berita game berjudul '{judul}' dan kontennya {konten}.\n"
        f"Task: Buatkan total 4 query pencarian singkat yang manusia biasa mungkin gunakan untuk mencari berita ini.\n"
        f"Aturan: Jangan salin judul secara utuh, variasikan tipe query (1 buah query literal (pakai kata kunci yang sangat mirip dengan judul), 1 buah query parafrasa yang menggunakan sinonim atau deskripsi singkat, 1 buah query yang mengandung typo, singkatan, atau ejaan tidak baku, 1 buah query yang bersifat informatif/pertanyaan (bentuk apa, kapan, benarkah, dll)), query harus alami seperti pencarian manusia, tidak terlalu panjang (maks 6 kata)"
        f"Format: Hanya list teks query, tanpa nomor, pisahkan dengan baris baru."
    )

    max_retries = 3
    success = False
    
    for attempt in range(max_retries):
        try:
            response = model.generate_content(
                prompt_text,
                generation_config=generation_config,
                safety_settings=safety_settings
            )
            
            if response.candidates and response.candidates[0].content.parts:
                # Ambil teks langsung dari parts
                raw_text = response.candidates[0].content.parts[0].text
                
                # Proses teks
                lines = [line.strip() for line in raw_text.split("\n") if line.strip()]
                for q in lines:
                    q_clean = q.lstrip('1234567890.- ').strip()
                    if q_clean: # Pastikan tidak string kosong
                        queries_json.append({
                            "qid": f"q{qid_counter}",
                            "query_text": q_clean,
                            "source_doc_id": row['id_dokumen']
                        })
                        qid_counter += 1
                
                print(f"‚úÖ Berhasil doc index {idx}")
                success = True
                break 
            
            else:
                print(f"‚ö†Ô∏è Percobaan {attempt+1}: Kosong. Reason: {response.candidates[0].finish_reason}")
        
        except Exception as e:
            print(f"‚ùå Percobaan {attempt+1} Gagal (Index {idx}): {e}")
            time.sleep(5)

    if not success:
        print(f"üíÄ GAGAL TOTAL index {idx} setelah {max_retries} kali coba.")

    # Delay antar dokumen (bukan antar retry)
    time.sleep(3) 

# Simpan hasil
with open('../evaluation/generated_queries.json', 'w', encoding='utf-8') as f:
    json.dump(queries_json, f, ensure_ascii=False, indent=2)

print(f"\nSelesai! Total query: {len(queries_json)}")

Mulai generate queries (Mode: High Token & Retry)...
‚úÖ Berhasil doc index 0
‚úÖ Berhasil doc index 1
‚úÖ Berhasil doc index 2
‚úÖ Berhasil doc index 3
‚úÖ Berhasil doc index 4
‚úÖ Berhasil doc index 5
‚úÖ Berhasil doc index 6
‚úÖ Berhasil doc index 7
‚úÖ Berhasil doc index 8
‚úÖ Berhasil doc index 9
‚úÖ Berhasil doc index 10
‚úÖ Berhasil doc index 11
‚úÖ Berhasil doc index 12
‚úÖ Berhasil doc index 13
‚úÖ Berhasil doc index 14
‚úÖ Berhasil doc index 15
‚úÖ Berhasil doc index 16
‚úÖ Berhasil doc index 17
‚úÖ Berhasil doc index 18
‚úÖ Berhasil doc index 19
‚úÖ Berhasil doc index 20
‚úÖ Berhasil doc index 21
‚úÖ Berhasil doc index 22
‚úÖ Berhasil doc index 23
‚úÖ Berhasil doc index 24
‚úÖ Berhasil doc index 25
‚úÖ Berhasil doc index 26
‚úÖ Berhasil doc index 27
‚úÖ Berhasil doc index 28
‚úÖ Berhasil doc index 29
‚úÖ Berhasil doc index 30
‚úÖ Berhasil doc index 31
‚úÖ Berhasil doc index 32
‚úÖ Berhasil doc index 33
‚úÖ Berhasil doc index 34
‚úÖ Berhasil doc index 35
‚úÖ Berhasil doc inde

##### Step 3: Run Retrieval Algorithm 

In [4]:
import pandas as pd
import numpy as np
import pickle
import os
import json
import torch
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# ==========================================
# 1. KONFIGURASI PATH
# ==========================================
MODEL_DIR = "../models/"
DATA_DIR = "../data/"
EVAL_DIR = "../evaluation/"

# Path Model
BM25_MODEL_PATH = os.path.join(MODEL_DIR, 'bm25_model.pkl')
CORPUS_DF_PATH = os.path.join(MODEL_DIR, 'df_corpus.pkl') # Dataframe referensi
SBERT_MODEL_PATH = os.path.join(MODEL_DIR, 'sbert_model.pkl')
SBERT_EMBEDDINGS_PATH = os.path.join(MODEL_DIR, 'sbert_embeddings.npy')

# Path Helper Preprocessing (Sesuai kode kamu)
WHITELIST_FILE_PATH = os.path.join(DATA_DIR, "whitelist.txt")
FINAL_STOPLIST_FILE_PATH = os.path.join(DATA_DIR, "final_stopwords.txt")

# Input & Output
QUERY_FILE = os.path.join(EVAL_DIR, "generated_queries.json")
OUTPUT_FILE = os.path.join(EVAL_DIR, "retrieval_pool.json")

# ==========================================
# 2. SETUP PREPROCESSING (COPY DARI KODEMU)
# ==========================================
print("Menyiapkan Preprocessing...")

stemmer = StemmerFactory().create_stemmer()

def load_words_set(file_path):
    """Memuat set kata dari file teks."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f if line.strip())
    except FileNotFoundError:
        print(f"‚ö†Ô∏è Peringatan: File {file_path} tidak ditemukan. Menggunakan set kosong.")
        return set()

# Load Stopwords & Whitelist
WHITELIST = load_words_set(WHITELIST_FILE_PATH)
FINAL_STOPLIST = load_words_set(FINAL_STOPLIST_FILE_PATH)

def preprocess_text_for_bm25(text):
    """
    Fungsi ini SAMA PERSIS dengan yang kamu pakai saat indexing.
    Digunakan khusus untuk query yang masuk ke BM25.
    """
    if not isinstance(text, str): return ""
    
    # Cleaning dasar
    text = re.sub(r'<[^>]*>', ' ', text)
    text = text.lower()
    text = text.replace("-", "")
    text = re.sub(r'[^\w\s]',' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = text.split()

    # Filter Stopwords
    tokens_filtered = [word for word in tokens if word not in FINAL_STOPLIST]

    stemmed_tokens = []
    for word in tokens_filtered:
        # Logika Stemming Selektif (Sesuai kodemu)
        if any(char.isalpha() for char in word) and any(char.isdigit() for char in word):
            stemmed_tokens.append(word)
            continue
            
        if word.isdigit():
            stemmed_tokens.append(word)
            continue
            
        if word in WHITELIST:
            stemmed_tokens.append(word)
            continue 
        
        stemmed_tokens.append(stemmer.stem(word))
    
    return ' '.join(stemmed_tokens)

# ==========================================
# 3. LOAD MODEL & DATA
# ==========================================
print("\nLoading Model & Resources...")

# A. Load Queries
with open(QUERY_FILE, 'r', encoding='utf-8') as f:
    queries_data = json.load(f)
print(f"‚úÖ Loaded {len(queries_data)} queries.")

# B. Load DataFrame Corpus (Untuk Mapping ID)
try:
    df_corpus = pd.read_pickle(CORPUS_DF_PATH)
    df_corpus = df_corpus.reset_index(drop=True)
    all_doc_ids = df_corpus['id_dokumen'].tolist()
    print(f"‚úÖ Loaded Corpus DataFrame: {len(df_corpus)} docs.")
except Exception as e:
    raise FileNotFoundError(f"Gagal load {CORPUS_DF_PATH}. Error: {e}")

# C. Load BM25
with open(BM25_MODEL_PATH, "rb") as f:
    bm25 = pickle.load(f)
print("‚úÖ Loaded BM25 Model.")

# D. Load S-BERT Embeddings
try:
    corpus_embeddings = np.load(SBERT_EMBEDDINGS_PATH)
    corpus_embeddings = torch.from_numpy(corpus_embeddings)
    print(f"‚úÖ Loaded S-BERT Embeddings shape: {corpus_embeddings.shape}")
except Exception as e:
    raise FileNotFoundError(f"Gagal load embedding. Error: {e}")

# E. Load S-BERT Model
with open(SBERT_MODEL_PATH, 'rb') as f:
    model_name = pickle.load(f) # Biasanya string nama modelnya
sbert_model = SentenceTransformer(model_name)
print(f"‚úÖ Loaded S-BERT Model: {model_name}")

# ==========================================
# 4. LOOP RETRIEVAL & POOLING
# ==========================================

retrieval_pool = {} 
TOP_K = 20 # Ambil Top-20 dari masing-masing metode

print(f"\nMemulai Retrieval Top-{TOP_K}...")

for i, q_item in enumerate(queries_data):
    qid = q_item['qid']
    raw_query = q_item['query_text'] # Query asli manusia (untuk S-BERT)
    
    # --- A. RETRIEVAL BM25 (Pakai Preprocessing) ---
    bm25_query_text = preprocess_text_for_bm25(raw_query)
    # 2. Split token
    tokenized_query = bm25_query_text.split()
    # 3. Get Scores & Sort
    doc_scores = bm25.get_scores(tokenized_query)
    top_n_indices = np.argsort(doc_scores)[::-1][:TOP_K]
    # 4. Map ke ID Dokumen
    bm25_doc_ids = [all_doc_ids[idx] for idx in top_n_indices]

    # --- B. RETRIEVAL S-BERT (Pakai Raw Query) ---
    query_emb = sbert_model.encode(raw_query, convert_to_tensor=True)
    hits = util.semantic_search(query_emb, corpus_embeddings, top_k=TOP_K)[0]
    sbert_doc_ids = [all_doc_ids[hit['corpus_id']] for hit in hits]

    # --- C. POOLING ---
    combined_candidates = list(set(bm25_doc_ids + sbert_doc_ids))
    
    retrieval_pool[qid] = {
        "query_text": raw_query,
        "processed_query_bm25": bm25_query_text, # Debugging info
        "source_doc_id": q_item['source_doc_id'],
        "candidates": combined_candidates
    }
    
    if (i+1) % 20 == 0:
        print(f"Processed {i+1} queries...")

# ==========================================
# 5. SIMPAN HASIL
# ==========================================

with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(retrieval_pool, f, ensure_ascii=False, indent=2)

print(f"\n‚úÖ Selesai! Hasil pooling disimpan di '{OUTPUT_FILE}'")
print(f"Total Query: {len(retrieval_pool)}")

Menyiapkan Preprocessing...

Loading Model & Resources...
‚úÖ Loaded 401 queries.
‚úÖ Loaded Corpus DataFrame: 1822 docs.
‚úÖ Loaded BM25 Model.
‚úÖ Loaded S-BERT Embeddings shape: torch.Size([1822, 768])


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: c1c842b6-5d61-4ce3-9267-c4d07f3977ce)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


‚úÖ Loaded S-BERT Model: paraphrase-multilingual-mpnet-base-v2

Memulai Retrieval Top-20...
Processed 20 queries...
Processed 40 queries...
Processed 60 queries...
Processed 80 queries...
Processed 100 queries...
Processed 120 queries...
Processed 140 queries...
Processed 160 queries...
Processed 180 queries...
Processed 200 queries...
Processed 220 queries...
Processed 240 queries...
Processed 260 queries...
Processed 280 queries...
Processed 300 queries...
Processed 320 queries...
Processed 340 queries...
Processed 360 queries...
Processed 380 queries...
Processed 400 queries...

‚úÖ Selesai! Hasil pooling disimpan di '../evaluation/retrieval_pool.json'
Total Query: 401


##### Step 4: Judging and Calculate Evaluation

In [5]:
import pandas as pd
import numpy as np
import pickle
import os
import json
import torch
import re
from tqdm.auto import tqdm
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# ==========================================
# 1. KONFIGURASI PATH
# ==========================================
MODEL_DIR = "../models/"
DATA_DIR = "../data/"
EVAL_DIR = "../evaluation/"

# Path Data & Model
BM25_MODEL_PATH = os.path.join(MODEL_DIR, 'bm25_model.pkl')
CORPUS_DF_PATH = os.path.join(MODEL_DIR, 'df_corpus.pkl')
SBERT_MODEL_PATH = os.path.join(MODEL_DIR, 'sbert_model.pkl')
SBERT_EMBEDDINGS_PATH = os.path.join(MODEL_DIR, 'sbert_embeddings.npy')

# Helper Preprocessing
WHITELIST_FILE_PATH = os.path.join(DATA_DIR, "whitelist.txt")
FINAL_STOPLIST_FILE_PATH = os.path.join(DATA_DIR, "final_stopwords.txt")

# Input & Output
POOL_FILE = os.path.join(EVAL_DIR, 'retrieval_pool.json')        
GROUND_TRUTH_FILE = os.path.join(EVAL_DIR, 'ground_truth.json')    
FINAL_REPORT_FILE = os.path.join(EVAL_DIR, 'eval_result.txt')    

# ==========================================
# 2. SETUP PREPROCESSING (Harus ada untuk BM25)
# ==========================================
print("Menyiapkan Preprocessing...")
stemmer = StemmerFactory().create_stemmer()

def load_words_set(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return set(line.strip() for line in f if line.strip())
    except: return set()

WHITELIST = load_words_set(WHITELIST_FILE_PATH)
FINAL_STOPLIST = load_words_set(FINAL_STOPLIST_FILE_PATH)

def preprocess_text_for_bm25(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'<[^>]*>', ' ', text)
    text = text.lower().replace("-", "")
    text = re.sub(r'[^\w\s]',' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    tokens_filtered = [word for word in tokens if word not in FINAL_STOPLIST]
    
    stemmed_tokens = []
    for word in tokens_filtered:
        if any(char.isalpha() for char in word) and any(char.isdigit() for char in word):
            stemmed_tokens.append(word)
        elif word.isdigit() or word in WHITELIST:
            stemmed_tokens.append(word)
        else:
            stemmed_tokens.append(stemmer.stem(word))
    return ' '.join(stemmed_tokens)

# ==========================================
# 3. LOAD DATA & MODEL
# ==========================================
print("\nLoading Resources...")

# Load Corpus
df_corpus = pd.read_pickle(CORPUS_DF_PATH).reset_index(drop=True)
all_doc_ids = df_corpus['id_dokumen'].tolist()
df_corpus['full_text'] = df_corpus['judul'].fillna('') + ". " + df_corpus['konten'].fillna('')
docs_map = pd.Series(df_corpus.full_text.values, index=df_corpus.id_dokumen).to_dict()

# Load Models
with open(BM25_MODEL_PATH, "rb") as f:
    bm25 = pickle.load(f)

corpus_embeddings = torch.from_numpy(np.load(SBERT_EMBEDDINGS_PATH))

with open(SBERT_MODEL_PATH, 'rb') as f:
    sbert_name = pickle.load(f)
sbert_model = SentenceTransformer(sbert_name)

# Load Cross-Encoder (Juri)
print("Loading Cross-Encoder (Juri Evaluasi)...")
judge_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Load Retrieval Pool
with open(POOL_FILE, 'r', encoding='utf-8') as f:
    pool_data = json.load(f)

# ==========================================
# 4. TAHAP A: JUDGING (MEMBUAT GROUND TRUTH)
# ==========================================
print(f"\n--- MULAI TAHAP JUDGING ({len(pool_data)} queries) ---")
ground_truth = {}

for q_id, data in tqdm(pool_data.items(), desc="Judging Candidates"):
    query = data['query_text']
    candidates = data['candidates'] # List ID dokumen
    source_doc = data['source_doc_id']
    
    # pasangan [Query, Dokumen] untuk dinilai
    pairs = []
    valid_candidates = []
    
    for doc_id in candidates:
        if doc_id in docs_map:
            doc_text = docs_map[doc_id][:1000]
            pairs.append([query, doc_text])
            valid_candidates.append(doc_id)
            
    if not pairs:
        ground_truth[q_id] = [source_doc]
        continue

    # Prediksi Skor
    scores = judge_model.predict(pairs)
    
    # Filter Relevan (Threshold > 0.5)
    relevant_docs = []
    for doc_id, score in zip(valid_candidates, scores):
        if score > 0.5:
            relevant_docs.append(doc_id)
            
    # FORCE INCLUDE SOURCE DOC
    # Dokumen asal pembuatan query WAJIB dianggap relevan (Known-Item assumption)
    if source_doc not in relevant_docs:
        relevant_docs.append(source_doc)
        
    ground_truth[q_id] = list(set(relevant_docs))

# Simpan Ground Truth
with open(GROUND_TRUTH_FILE, 'w') as f:
    json.dump(ground_truth, f, indent=2)
print(f"Ground Truth tersimpan di {GROUND_TRUTH_FILE}")

# ==========================================
# 5. TAHAP B: EVALUASI METRIK
# ==========================================
print("\n--- MULAI PERHITUNGAN METRIK ---")

def calculate_metrics(retrieved_ids, true_ids, k=10):
    """Menghitung P@k, R@k, F1@k, AP@k"""
    retrieved_k = retrieved_ids[:k]
    true_set = set(true_ids)
    
    # Hits
    hits = [1 if doc in true_set else 0 for doc in retrieved_k]
    num_hits = sum(hits)
    num_relevant = len(true_set)
    
    # Precision & Recall
    precision = num_hits / k
    recall = num_hits / num_relevant if num_relevant > 0 else 0
    
    # F1
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    # Average Precision (AP)
    ap = 0.0
    running_hits = 0
    for i, is_hit in enumerate(hits):
        if is_hit:
            running_hits += 1
            ap += running_hits / (i + 1)
    
    ap = ap / num_relevant if num_relevant > 0 else 0
    return precision, recall, f1, ap

# Storage hasil
metrics_bm25 = {'p': [], 'r': [], 'f1': [], 'ap': []}
metrics_sbert = {'p': [], 'r': [], 'f1': [], 'ap': []}
K = 10

for q_id, true_ids in tqdm(ground_truth.items(), desc="Calculating Metrics"):
    # Ambil teks query asli
    raw_query = pool_data[q_id]['query_text']
    
    # --- 1. JALANKAN BM25 ULANG ---
    bm25_query = preprocess_text_for_bm25(raw_query)
    bm25_tokens = bm25_query.split()
    bm25_scores = bm25.get_scores(bm25_tokens)
    bm25_top_idx = np.argsort(bm25_scores)[::-1][:K]
    bm25_res = [all_doc_ids[i] for i in bm25_top_idx]
    
    # --- 2. JALANKAN S-BERT ULANG ---
    sbert_emb = sbert_model.encode(raw_query, convert_to_tensor=True)
    hits = util.semantic_search(sbert_emb, corpus_embeddings, top_k=K)[0]
    sbert_res = [all_doc_ids[hit['corpus_id']] for hit in hits]
    
    # --- 3. HITUNG SKOR ---
    p, r, f, ap = calculate_metrics(bm25_res, true_ids, k=K)
    metrics_bm25['p'].append(p)
    metrics_bm25['r'].append(r)
    metrics_bm25['f1'].append(f)
    metrics_bm25['ap'].append(ap)
    
    p, r, f, ap = calculate_metrics(sbert_res, true_ids, k=K)
    metrics_sbert['p'].append(p)
    metrics_sbert['r'].append(r)
    metrics_sbert['f1'].append(f)
    metrics_sbert['ap'].append(ap)

# ==========================================
# 6. HASIL AKHIR
# ==========================================
result_text = f"""
=================================================
HASIL EVALUASI INFORMATION RETRIEVAL (TOP-{K})
=================================================
Jumlah Query: {len(ground_truth)}
Ground Truth Method: Cross-Encoder (MS-MARCO)

1. ALGORITMA BM25 (Lexical)
   - Mean Precision@{K} : {np.mean(metrics_bm25['p']):.4f}
   - Mean Recall@{K}    : {np.mean(metrics_bm25['r']):.4f}
   - Mean F1-Score@{K}  : {np.mean(metrics_bm25['f1']):.4f}
   - MAP (Mean AP)     : {np.mean(metrics_bm25['ap']):.4f}

2. ALGORITMA S-BERT (Semantic)
   - Mean Precision@{K} : {np.mean(metrics_sbert['p']):.4f}
   - Mean Recall@{K}    : {np.mean(metrics_sbert['r']):.4f}
   - Mean F1-Score@{K}  : {np.mean(metrics_sbert['f1']):.4f}
   - MAP (Mean AP)     : {np.mean(metrics_sbert['ap']):.4f}
=================================================
"""

print(result_text)

# Simpan ke file
with open(FINAL_REPORT_FILE, 'w') as f:
    f.write(result_text)

Menyiapkan Preprocessing...

Loading Resources...
Loading Cross-Encoder (Juri Evaluasi)...

--- MULAI TAHAP JUDGING (401 queries) ---


Judging Candidates:   0%|          | 0/401 [00:00<?, ?it/s]

Ground Truth tersimpan di ../evaluation/ground_truth.json

--- MULAI PERHITUNGAN METRIK ---


Calculating Metrics:   0%|          | 0/401 [00:00<?, ?it/s]


HASIL EVALUASI INFORMATION RETRIEVAL (TOP-10)
Jumlah Query: 401
Ground Truth Method: Cross-Encoder (MS-MARCO)

1. ALGORITMA BM25 (Lexical)
   - Mean Precision@10 : 0.2566
   - Mean Recall@10    : 0.8443
   - Mean F1-Score@10  : 0.3378
   - MAP (Mean AP)     : 0.7358

2. ALGORITMA S-BERT (Semantic)
   - Mean Precision@10 : 0.2002
   - Mean Recall@10    : 0.6596
   - Mean F1-Score@10  : 0.2616
   - MAP (Mean AP)     : 0.5005

