<a href="https://colab.research.google.com/github/anggaangoro3/data-science-project/blob/main/scrping_rumah123_selenium%40V1.3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:

#V1.2.1
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os
import threading
import tkinter as tk
from tkinter import messagebox

# --- Pesan Perkenalan ---
print("Research by @Reza Anggoro")
print("Property @scraping V1.2.1")
print("Use it for Educational Purposes only!")
print("\nThis script uses Chromium Browser to crawl data from Rumah123.")
print("Note: This script is configured to run on your local device.")
print("\nOpening rumah123 search page...\n")
print("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")

# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # Membuka browser fullscreen

# --- Menambahkan User Agent Acak ---
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument(f'user-agent={random.choice(user_agents)}')

# --- Inisialisasi Driver Chrome ---
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(60)
print("Driver started successfully.")

# List untuk menyimpan data hasil scrape
data_list = []

# Path file CSV
csv_file = './scraped_land_data.csv'

# --- Pengecekan File CSV yang Sudah Ada ---
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        exit()

# URL dasar
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Rentang halaman yang akan di-scrape
start_page = 22
end_page = 99

# Flag untuk stop scraping
stop_scraping = False

# Fungsi untuk window tombol stop (jalan di thread terpisah)
def stop_button_window():
    global stop_scraping
    root = tk.Tk()
    root.title("Scraping Control")
    root.geometry("200x100")

    def on_stop():
        global stop_scraping
        stop_scraping = True
        messagebox.showinfo("Stop", "Scraping will stop after current operation.")
        root.destroy()

    button = tk.Button(root, text="Stop Scraping", command=on_stop)
    button.pack(pady=20)
    root.mainloop()

# Jalankan window stop di thread background
threading.Thread(target=stop_button_window, daemon=True).start()

# --- Loop Halaman Utama (PAGINATION) ---
for page in range(start_page, end_page + 1):
    if stop_scraping:
        break

    url = base_url + str(page)
    print(f"Opening page {page}: {url}")
    driver.get(url)

    # Jeda acak untuk memuat halaman
    time.sleep(random.uniform(4, 6))

    # --- Pengecekan CAPTCHA yang Terlihat ---
    captcha_detected = False
    try:
        captcha_element = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
        if captcha_element.is_displayed():
            captcha_detected = True
    except NoSuchElementException:
        pass

    if captcha_detected:
        print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
        print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
        print("!!! Skrip akan menjeda selama 25 DETIK agar Anda bisa menyelesaikannya. !!!")
        time.sleep(25)
        print("Waktu jeda selesai, melanjutkan skrip...")

    # --- Blok untuk Scrolling ---
    print("Waiting for page layout to settle before scrolling...")
    time.sleep(1.5)

    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    scroll_count = 1
    max_scroll_attempts = 25
    print("Starting scroll (slower and deeper)...")

    while current_scroll < total_height and scroll_count < max_scroll_attempts:
        if stop_scraping:
            break

        scroll_amount = random.randint(400, 700)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        print(f"-- Scrolling... ({scroll_count}), waiting for content...")
        time.sleep(random.uniform(1.5, 2.2))

        current_scroll += scroll_amount
        scroll_count += 1

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == total_height:
            print("-- Page height not changing, likely at bottom.")
            break
        total_height = new_height

    if stop_scraping:
        break

    print("Doing one final scroll to the absolute bottom...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    print("Waiting 3 seconds at the bottom for final loads...")
    time.sleep(3.0)

    print("Experiment: Scrolling back to top slowly...")
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(1.5, 2.5))

#Baru v1.2.3
# Wait dinamis sampai minimal 20 listing muncul (gunakan WebDriverWait)
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    try:
        WebDriverWait(driver, 10).until(
            lambda d: len(d.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")) >= 20
        )
        print("All 20 listings loaded.")
    except:
        print("Timeout waiting for 20 listings. Proceeding with available.")

    print("Scrolling finished for this page.")
    #Baru end

    mid_height = total_height // 2
    print(f"Experiment: Scrolling to middle ({mid_height}px) and pausing...")
    driver.execute_script(f"window.scrollTo(0, {mid_height});")
    time.sleep(random.uniform(2.0, 3.0))

    print("Scrolling finished for this page.")

    # --- Ambil semua link (href) dari halaman daftar ---
    href_list = []
    try:
        listings = driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")
        for listing in listings:
            try:
                href = listing.get_attribute('href')
                if href and href.startswith('https://www.rumah123.com/properti/'):
                    href_list.append(href)
            except:
                pass
        href_list = list(dict.fromkeys(href_list))
        print(f"Found {len(href_list)} property links on page {page}.")
    except NoSuchElementException:
        print(f"No more listings found on page {page}. Stopping.")
        break

    if not href_list:
        print(f"No valid listings found on page {page}. Stopping.")
        break

    # --- Loop Halaman Detail (per link) ---
    for href in href_list:
        if stop_scraping:
            break

        time.sleep(random.uniform(1, 3))
        print(f"Opening detail page: {href}")

        try:
            # Coba buka halaman
            driver.get(href)

        except TimeoutException:
            # Jika halaman gagal dimuat dalam 60 detik
            print(f"!!! PAGE TIMEOUT: Halaman {href} terlalu lama dimuat. Melewati...")
            # 'continue' akan mengabaikan sisa kode di loop ini
            # dan langsung lanjut ke 'href' berikutnya
            continue

        except Exception as e:
            # Menangkap error lain jika terjadi
            print(f"!!! ERROR LAIN saat membuka {href}: {e}. Melewati...")
            continue

        # Kode ini hanya akan berjalan JIKA 'driver.get()' BERHASIL
        time.sleep(random.uniform(3, 6))
        # ... (sisa kode Anda untuk cek CAPTCHA dan extract data) ...
        # Cek VISIBLE CAPTCHA di halaman detail
        captcha_detected = False
        try:
            captcha_element_detail = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
            if captcha_element_detail.is_displayed():
                captcha_detected = True
        except NoSuchElementException:
            pass

        if captcha_detected:
            print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
            print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(20)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Tambahkan scroll sedikit pada page detail (2-3 kali)
        detail_scroll_count = random.randint(2, 3)
        for i in range(detail_scroll_count):
            scroll_amount = random.randint(200, 400)
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            time.sleep(random.uniform(0.5, 1.0))

        # Tambahkan waktu sedikit untuk memastikan seluruh nilai muncul
        time.sleep(random.uniform(2, 3))

        # Coba klik tombol "Muat lebih banyak" (jika ada)
        expand_clicked = False
        try:
            expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
            expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
            ActionChains(driver).move_to_element(expand_button).perform()
            time.sleep(random.uniform(0.5, 1))
            expand_button.click()
            time.sleep(random.uniform(1, 2))
            print("Successfully clicked 'Muat lebih banyak' button.")
            expand_clicked = True
        except Exception as e:
            print(f"No 'Muat lebih banyak' button found or click failed. Proceeding without expansion.")

        # --- Blok Ekstraksi Data (DIPERBAIKI: Tambah try-except lengkap) ---
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            data = {}
            try:
                data['Product_sku_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(4) p.text-sm").text
            except:
                data['Product_sku_0'] = ''

            try:
                data['Place_name_0'] = driver.find_element(By.TAG_NAME, "h1").text
            except:
                data['Place_name_0'] = ''

            try:
                data['Total_Price_0'] = driver.find_element(By.CSS_SELECTOR, "span.text-primary").text
            except:
                data['Total_Price_0'] = ''

            try:
                data['Price_per_meter_0'] = driver.find_element(By.CSS_SELECTOR, "span.font-normal.text-sm").text
            except:
                data['Price_per_meter_0'] = ''

            try:
                data['Land_Area_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex-col:nth-of-type(1) p.text-gray-800").text
            except:
                data['Land_Area_0'] = ''

            try:
                data['Certificate_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.text-base").text
            except:
                data['Certificate_0'] = ''

            try:
                data['Land_Dimensions_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(1) p.font-medium.text-sm").text
            except:
                data['Land_Dimensions_0'] = ''

            try:
                data['Property_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.font-medium.text-sm").text
            except:
                data['Property_Type_0'] = ''

            try:
                data['Ad_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.items-center:nth-of-type(3) p.text-sm").text
            except:
                data['Ad_Type_0'] = ''

            try:
                data['Place_PostalAddress_addressLocality_0'] = driver.find_element(By.CSS_SELECTOR, "p.mb-2").text
            except:
                data['Place_PostalAddress_addressLocality_0'] = ''

            try:
                data['BreadcrumbList_ListItem_name_2'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(4) a").text
            except:
                data['BreadcrumbList_ListItem_name_2'] = ''

            try:
                data['BreadcrumbList_ListItem_name_3'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(5) a").text
            except:
                data['BreadcrumbList_ListItem_name_3'] = ''

            try:
                data['Product_description_0'] = driver.find_element(By.CSS_SELECTOR, "p.font-light.text-sm").text
            except:
                data['Product_description_0'] = ''

            # Peningkatan 1: Cek jika CAPTCHA terdeteksi berdasarkan 'Place_name_0'
            if data['Place_name_0'] == 'www.rumah123.com':
                print(f"!!! CAPTCHA DETECTED via data check on detail page {href}. !!!")
                print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
                print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
                time.sleep(20)
                print("Waktu jeda selesai, reloading detail page...")
                driver.get(href)
                time.sleep(random.uniform(3, 6))
                retry_count += 1
                continue  # Ekstrak ulang
            else:
                break  # Sukses, lanjut

        if retry_count == max_retries:
            print(f"Max retries reached for {href}. Proceeding with possibly incomplete data.")

        # Peningkatan 2: Koreksi inkonsistensi jika expand gagal
        if not expand_clicked:
            # Shift values based on observed mismatch
            temp_sku = data['Ad_Type_0']
            temp_ad_type = data['Property_Type_0']
            temp_property_type = data['Land_Dimensions_0']
            # Assume Land_Dimensions not available or parse from description if needed
            data['Product_sku_0'] = temp_sku
            data['Ad_Type_0'] = temp_ad_type
            data['Property_Type_0'] = temp_property_type
            data['Land_Dimensions_0'] = ''  # Or extract from description if possible

        # Print data untuk debug
        print(f"Extracted data: {data}")

        data_list.append(data)

        # Simpan data ke CSV secara bertahap
        df = pd.DataFrame(data_list)
        df.to_csv(csv_file, index=False)
        print(f"Your data saved to: {csv_file}")
        print(f"Total items saved: {len(data_list)}\n")

        # Jeda sebelum kembali ke halaman daftar
        time.sleep(random.uniform(1, 3))
        print(f"Going back to listing page {page}...")
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    # Jeda antar halaman
    print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
    time.sleep(10)

# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")


Research by @Reza Anggoro
Property @scraping V1.2.1
Use it for Educational Purposes only!

This script uses Chromium Browser to crawl data from Rumah123.
Note: This script is configured to run on your local device.

Opening rumah123 search page...

Note: A Stop button window will appear. Click it to stop scraping and save data.

Starting local Chrome driver...
Driver started successfully.
Opening page 2: https://www.rumah123.com/jual/dki-jakarta/tanah/?page=2
Waiting for page layout to settle before scrolling...
Starting scroll (slower and deeper)...
-- Scrolling... (1), waiting for content...
-- Scrolling... (2), waiting for content...
-- Page height not changing, likely at bottom.
Doing one final scroll to the absolute bottom...
Waiting 3 seconds at the bottom for final loads...
Experiment: Scrolling back to top slowly...
Timeout waiting for 20 listings. Proceeding with available.
Scrolling finished for this page.
Experiment: Scrolling to middle (9512px) and pausing...
Scrolling fini

In [2]:
import pandas as pd

def read_scraped_data(filename='scraped_land_data.csv'):
    """Reads scraped data from a CSV file into a pandas DataFrame."""
    try:
        df = pd.read_csv(filename)
        print(f"Successfully loaded data from '{filename}'.")
        print(f"DataFrame shape: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None
    except pd.errors.EmptyDataError:
        print(f"Warning: The file '{filename}' is empty or only contains headers.")
        return pd.DataFrame() # Return an empty DataFrame
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return None

scraped_df = read_scraped_data()

if scraped_df is not None and not scraped_df.empty:
    print("Displaying the entire scraped dataset:")
    display(scraped_df)
elif scraped_df is not None and scraped_df.empty:
    print("The dataset is empty. No data to display.")
else:
    print("Could not read the dataset.")

Successfully loaded data from 'scraped_land_data.csv'.
DataFrame shape: (132, 13)
Displaying the entire scraped dataset:


Unnamed: 0,Product_sku_0,Place_name_0,Total_Price_0,Price_per_meter_0,Land_Area_0,Certificate_0,Land_Dimensions_0,Property_Type_0,Ad_Type_0,Place_PostalAddress_addressLocality_0,BreadcrumbList_ListItem_name_2,BreadcrumbList_ListItem_name_3,Product_description_0
0,Dijual,Di Jual Tanah Kavling Dalam Cluster Pejaten Barat,Rp 15 Juta Total,,237 m² (24x10m),SHM,24x10 m,Ada,Tanah,"Pejaten, Jakarta Selatan",Jakarta Selatan,Pejaten,Dijual tanah dalam cluster posisi hook\nLt 237...
1,las3800069,Tanah Lokasi Langka Simprug Garden 7 Termurah ...,"Rp 46,5 Miliar Total",Rp 40 Juta /m²,1163 m² (38x30m),SHM,38x30 m,Tanah,Dijual,"Simprug Garden, Jakarta Selatan",Jakarta Selatan,Simprug Garden,Lokasi tanah sangat prime\nLokasi jarang ada\n...
2,las8818624,KAVLING HOEK DI TAMAN PALEM LESTARI JARANG ADA...,Rp 6 Miliar Total,"Rp 19,5 Juta /m²",308 m² (16x20m),HGB,16x20 m,Tanah,Dijual,"Taman Palem, Jakarta Barat",Jakarta Barat,Taman Palem,Kode Listing : RMTPL0035SDV\n\nLuas Tanah : 30...
3,Dijual,Jamin Tanah Termurah Terogong Pondok Indah Har...,Rp 63 Miliar Total,,2805 m² (20x140m),SHM,20x140 m,Ada,Tanah,"Terogong, Jakarta Selatan",Jakarta Selatan,Terogong,Lokasi langka pondok indah\nSelangkah ke sekol...
4,las8822748,Tanah Kosong Lokasi Jarang Ada Barang Langka R...,Rp 25 Miliar Total,,448 m²,SHM,,Tanah,Dijual,"Radio Dalam, Jakarta Selatan",Jakarta Selatan,Radio Dalam,Lokasi super keren banget\nJalan radio dalam r...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,las8966837,Murah !! Dijual Kavling Luas 204 Jalan 3 Mobil...,"Rp 4,5 Miliar Total",,204 m²,HGB,,Tanah,Dijual,"Kelapa Gading, Jakarta Utara",Jakarta Utara,Kelapa Gading,Murah !! Dijual Kavling Luas 204 Jalan 3 Mobil...
128,las3295838,"Murah Kavling Camar Elok Pik, Nego","Rp 8,6 Miliar Total",Rp 23 Juta /m²,374 m²,HGB,,Tanah,Dijual,"Pantai Indah Kapuk, Jakarta Utara",Jakarta Utara,Pantai Indah Kapuk,Jual kavling camar elok\nHook\n374m²\nUtara Timur
129,las8868866,DIJUAL 3 UNIT KAVLING SIAP BANGUN@TOSIGA KEBON...,"Rp 13,8 Miliar Total",,868 m²,SHM,,Tanah,Dijual,"Kebon Jeruk, Jakarta Barat",Jakarta Barat,Kebon Jeruk,DIJUAL KAVLING SIAP BANGUN 3 UNIT MENJADI SATU...
130,las8753075,Kavling Green Lake City Termurah Dibawah Pasar...,"Rp 3,22 Miliar Total","Rp 17,9 Juta /m²",180 m² (18x10m),PPJB,18x10 m,Tanah,Dijual,"Green Lake City, Jakarta Barat",Jakarta Barat,Green Lake City,Green lake city Dijual termurah lokasi premium...


In [None]:
# ... (kode script Anda sebelumnya)
"""
# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")
"""
# Tambahkan ini untuk hibernate otomatis (Windows)
import os
print("Scraping selesai. Memulai hibernate laptop...")
os.system("shutdown /h")  # Perintah hibernate di Windows

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")

Scraping selesai. Memulai hibernate laptop...


In [6]:
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os
import threading
import tkinter as tk
from tkinter import messagebox

# --- Pesan Perkenalan ---
print("Research by @Reza Anggoro")
print("Property @scraping V1.3")
print("Use it for Educational Purposes only!")
print("\nThis script uses Chromium Browser to crawl data from Rumah123.")
print("Note: This script is configured to run on your local device.")
print("\nOpening rumah123 search page...\n")
print("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")

# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # Membuka browser fullscreen
#driver.set_page_load_timeout(60)
# --- Menambahkan User Agent Acak ---
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument(f'user-agent={random.choice(user_agents)}')

# --- Inisialisasi Driver Chrome ---
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
print("Driver started successfully.")

# List untuk menyimpan data hasil scrape
data_list = []

# Path file CSV
csv_file = './scraped_land_data.csv'

# --- Pengecekan File CSV yang Sudah Ada ---
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        exit()

# URL dasar
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Rentang halaman yang akan di-scrape
start_page = 105
end_page = 800

# Flag untuk stop scraping
stop_scraping = False

# Fungsi untuk window tombol stop (jalan di thread terpisah)
def stop_button_window():
    global stop_scraping
    root = tk.Tk()
    root.title("Scraping Control")
    root.geometry("200x100")

    def on_stop():
        global stop_scraping
        stop_scraping = True
        messagebox.showinfo("Stop", "Scraping will stop after current operation.")
        root.destroy()

    button = tk.Button(root, text="Stop Scraping", command=on_stop)
    button.pack(pady=20)
    root.mainloop()

# Jalankan window stop di thread background
threading.Thread(target=stop_button_window, daemon=True).start()

# --- Loop Halaman Utama (PAGINATION) ---
for page in range(start_page, end_page + 1):
    if stop_scraping:
        break

    url = base_url + str(page)
    print(f"Opening page {page}: {url}")
    driver.get(url)

    # Jeda acak untuk memuat halaman
    time.sleep(random.uniform(3, 6))

    # --- Pengecekan CAPTCHA yang Terlihat ---
    captcha_detected = False
    try:
        captcha_element = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
        if captcha_element.is_displayed():
            captcha_detected = True
    except NoSuchElementException:
        pass

    if captcha_detected:
        print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
        print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
        print("!!! Skrip akan menjeda selama 25 DETIK agar Anda bisa menyelesaikannya. !!!")
        time.sleep(25)
        print("Waktu jeda selesai, melanjutkan skrip...")

    # --- Blok untuk Scrolling ---
    print("Waiting for page layout to settle before scrolling...")
    time.sleep(1.5)

    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    scroll_count = 1
    max_scroll_attempts = 50
    print("Starting scroll (slower and deeper)...")

    while current_scroll < total_height and scroll_count < max_scroll_attempts:
        if stop_scraping:
            break

        scroll_amount = random.randint(400, 700)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        print(f"-- Scrolling... ({scroll_count}), waiting for content...")
        time.sleep(random.uniform(1.5, 2.2))

        current_scroll += scroll_amount
        scroll_count += 1

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == total_height:
            print("-- Page height not changing, likely at bottom.")
            break
        total_height = new_height

    if stop_scraping:
        break

    print("Doing one final scroll to the absolute bottom...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    print("Waiting 3 seconds at the bottom for final loads...")
    time.sleep(3.0)

    print("Experiment: Scrolling back to top slowly...")
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(1.5, 2.5))

    mid_height = total_height // 2
    print(f"Experiment: Scrolling to middle ({mid_height}px) and pausing...")
    driver.execute_script(f"window.scrollTo(0, {mid_height});")
    time.sleep(random.uniform(2.0, 3.0))

    print("Scrolling finished for this page.")

    # --- Ambil semua link (href) dari halaman daftar ---
    href_list = []
    try:
        listings = driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")
        for listing in listings:
            try:
                href = listing.get_attribute('href')
                if href and href.startswith('https://www.rumah123.com/properti/'):
                    href_list.append(href)
            except:
                pass
        href_list = list(dict.fromkeys(href_list))
        print(f"Found {len(href_list)} property links on page {page}.")
    except NoSuchElementException:
        print(f"No more listings found on page {page}. Stopping.")
        break

    if not href_list:
        print(f"No valid listings found on page {page}. Restarting for this page...")
        driver.quit()
        time.sleep(5)  # Jeda sebelum restart
        print("Restarting driver for page {page}...")
        driver = webdriver.Chrome(options=options)
        continue  # Ulangi loop untuk halaman ini

    # --- Loop Halaman Detail (per link) ---
    for href in href_list:
        if stop_scraping:
            break

        time.sleep(random.uniform(1, 3))
        print(f"Opening detail page: {href}")

        try:
            # Coba buka halaman
            driver.get(href)

        except TimeoutException:
            # Jika halaman gagal dimuat dalam 60 detik
            print(f"!!! PAGE TIMEOUT: Halaman {href} terlalu lama dimuat. Melewati...")
            # 'continue' akan mengabaikan sisa kode di loop ini
            # dan langsung lanjut ke 'href' berikutnya
            continue

        except Exception as e:
            # Menangkap error lain jika terjadi
            print(f"!!! ERROR LAIN saat membuka {href}: {e}. Melewati...")
            continue

        # Kode ini hanya akan berjalan JIKA 'driver.get()' BERHASIL
        time.sleep(random.uniform(3, 6))
        # ... (sisa kode Anda untuk cek CAPTCHA dan extract data) ...
        # Cek VISIBLE CAPTCHA di halaman detail
        captcha_detected = False
        try:
            captcha_element_detail = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
            if captcha_element_detail.is_displayed():
                captcha_detected = True
        except NoSuchElementException:
            pass

        if captcha_detected:
            print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
            print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(20)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Tambahkan scroll sedikit pada page detail (2-3 kali)
        detail_scroll_count = random.randint(2, 3)
        for i in range(detail_scroll_count):
            scroll_amount = random.randint(200, 400)
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            time.sleep(random.uniform(0.5, 1.0))

        # Tambahkan waktu sedikit untuk memastikan seluruh nilai muncul
        time.sleep(random.uniform(2, 3))

        # Coba klik tombol "Muat lebih banyak" (jika ada)
        expand_clicked = False
        try:
            expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
            expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
            ActionChains(driver).move_to_element(expand_button).perform()
            time.sleep(random.uniform(0.5, 1))
            expand_button.click()
            time.sleep(random.uniform(1, 2))
            print("Successfully clicked 'Muat lebih banyak' button.")
            expand_clicked = True
        except Exception as e:
            print(f"No 'Muat lebih banyak' button found or click failed. Proceeding without expansion.")

        # --- Blok Ekstraksi Data (DIPERBAIKI: Tambah try-except lengkap) ---
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            data = {}
            try:
                data['Product_sku_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(4) p.text-sm").text
            except:
                data['Product_sku_0'] = ''

            try:
                data['Place_name_0'] = driver.find_element(By.TAG_NAME, "h1").text
            except:
                data['Place_name_0'] = ''

            try:
                data['Total_Price_0'] = driver.find_element(By.CSS_SELECTOR, "span.text-primary").text
            except:
                data['Total_Price_0'] = ''

            try:
                data['Price_per_meter_0'] = driver.find_element(By.CSS_SELECTOR, "span.font-normal.text-sm").text
            except:
                data['Price_per_meter_0'] = ''

            try:
                data['Land_Area_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex-col:nth-of-type(1) p.text-gray-800").text
            except:
                data['Land_Area_0'] = ''

            try:
                data['Certificate_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.text-base").text
            except:
                data['Certificate_0'] = ''

            try:
                data['Land_Dimensions_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(1) p.font-medium.text-sm").text
            except:
                data['Land_Dimensions_0'] = ''

            try:
                data['Property_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.font-medium.text-sm").text
            except:
                data['Property_Type_0'] = ''

            try:
                data['Ad_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.items-center:nth-of-type(3) p.text-sm").text
            except:
                data['Ad_Type_0'] = ''

            try:
                data['Place_PostalAddress_addressLocality_0'] = driver.find_element(By.CSS_SELECTOR, "p.mb-2").text
            except:
                data['Place_PostalAddress_addressLocality_0'] = ''

            try:
                data['BreadcrumbList_ListItem_name_2'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(4) a").text
            except:
                data['BreadcrumbList_ListItem_name_2'] = ''

            try:
                data['BreadcrumbList_ListItem_name_3'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(5) a").text
            except:
                data['BreadcrumbList_ListItem_name_3'] = ''

            try:
                data['Product_description_0'] = driver.find_element(By.CSS_SELECTOR, "p.font-light.text-sm").text
            except:
                data['Product_description_0'] = ''

            # Peningkatan 1: Cek jika CAPTCHA terdeteksi berdasarkan 'Place_name_0'
            if data['Place_name_0'] == 'www.rumah123.com':
                print(f"!!! CAPTCHA DETECTED via data check on detail page {href}. !!!")
                print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
                print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
                time.sleep(20)
                print("Waktu jeda selesai, reloading detail page...")
                driver.get(href)
                time.sleep(random.uniform(3, 6))
                retry_count += 1
                continue  # Ekstrak ulang
            else:
                break  # Sukses, lanjut

        if retry_count == max_retries:
            print(f"Max retries reached for {href}. Proceeding with possibly incomplete data.")

        # Peningkatan 2: Koreksi inkonsistensi jika expand gagal
        if not expand_clicked:
            # Shift values based on observed mismatch
            temp_sku = data['Ad_Type_0']
            temp_ad_type = data['Property_Type_0']
            temp_property_type = data['Land_Dimensions_0']
            # Assume Land_Dimensions not available or parse from description if needed
            data['Product_sku_0'] = temp_sku
            data['Ad_Type_0'] = temp_ad_type
            data['Property_Type_0'] = temp_property_type
            data['Land_Dimensions_0'] = ''  # Or extract from description if possible

        # Print data untuk debug
        print(f"Extracted data: {data}")

        data_list.append(data)

        # Simpan data ke CSV secara bertahap
        df = pd.DataFrame(data_list)
        df.to_csv(csv_file, index=False)
        print(f"Your data saved to: {csv_file}")
        print(f"Total items saved: {len(data_list)}\n")

        # Jeda sebelum kembali ke halaman daftar
        time.sleep(random.uniform(1, 3))
        print(f"Going back to listing page {page}...")
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    # Jeda antar halaman
    print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
    time.sleep(10)

# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")
"""

Research by @Reza Anggoro
Property @scraping V1.3
Use it for Educational Purposes only!

This script uses Chromium Browser to crawl data from Rumah123.
Note: This script is configured to run on your local device.

Opening rumah123 search page...

Note: A Stop button window will appear. Click it to stop scraping and save data.

Starting local Chrome driver...
Driver started successfully.
Opening page 105: https://www.rumah123.com/jual/dki-jakarta/tanah/?page=105
Waiting for page layout to settle before scrolling...
Starting scroll (slower and deeper)...
-- Scrolling... (1), waiting for content...
-- Scrolling... (2), waiting for content...
-- Page height not changing, likely at bottom.
Doing one final scroll to the absolute bottom...
Waiting 3 seconds at the bottom for final loads...
Experiment: Scrolling back to top slowly...
Experiment: Scrolling to middle (349px) and pausing...
Scrolling finished for this page.
Found 0 property links on page 105.
No valid listings found on page 105. 

In [1]:
"""
#V1.2.1
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os
import threading
import tkinter as tk
from tkinter import messagebox

# --- Pesan Perkenalan ---
print("Research by @Reza Anggoro")
print("Property @scraping V1.2.1")
print("Use it for Educational Purposes only!")
print("\nThis script uses Chromium Browser to crawl data from Rumah123.")
print("Note: This script is configured to run on your local device.")
print("\nOpening rumah123 search page...\n")
print("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")

# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # Membuka browser fullscreen

# --- Menambahkan User Agent Acak ---
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument(f'user-agent={random.choice(user_agents)}')

# --- Inisialisasi Driver Chrome ---
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(60)
print("Driver started successfully.")

# List untuk menyimpan data hasil scrape
data_list = []

# Path file CSV
csv_file = './scraped_land_data.csv'

# --- Pengecekan File CSV yang Sudah Ada ---
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        exit()

# URL dasar
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Rentang halaman yang akan di-scrape
start_page = 1
end_page = 99

# Flag untuk stop scraping
stop_scraping = False

# Fungsi untuk window tombol stop (jalan di thread terpisah)
def stop_button_window():
    global stop_scraping
    root = tk.Tk()
    root.title("Scraping Control")
    root.geometry("200x100")

    def on_stop():
        global stop_scraping
        stop_scraping = True
        messagebox.showinfo("Stop", "Scraping will stop after current operation.")
        root.destroy()

    button = tk.Button(root, text="Stop Scraping", command=on_stop)
    button.pack(pady=20)
    root.mainloop()

# Jalankan window stop di thread background
threading.Thread(target=stop_button_window, daemon=True).start()

# --- Loop Halaman Utama (PAGINATION) ---
for page in range(start_page, end_page + 1):
    if stop_scraping:
        break

    url = base_url + str(page)
    print(f"Opening page {page}: {url}")
    driver.get(url)

    # Jeda acak untuk memuat halaman
    time.sleep(random.uniform(4, 6))

    # --- Pengecekan CAPTCHA yang Terlihat ---
    captcha_detected = False
    try:
        captcha_element = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
        if captcha_element.is_displayed():
            captcha_detected = True
    except NoSuchElementException:
        pass

    if captcha_detected:
        print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
        print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
        print("!!! Skrip akan menjeda selama 25 DETIK agar Anda bisa menyelesaikannya. !!!")
        time.sleep(25)
        print("Waktu jeda selesai, melanjutkan skrip...")

    # --- Blok untuk Scrolling ---
    print("Waiting for page layout to settle before scrolling...")
    time.sleep(1.5)

    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    scroll_count = 1
    max_scroll_attempts = 50
    print("Starting scroll (slower and deeper)...")

    while current_scroll < total_height and scroll_count < max_scroll_attempts:
        if stop_scraping:
            break

        scroll_amount = random.randint(400, 700)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        print(f"-- Scrolling... ({scroll_count}), waiting for content...")
        time.sleep(random.uniform(1.5, 2.2))

        current_scroll += scroll_amount
        scroll_count += 1

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == total_height:
            print("-- Page height not changing, likely at bottom.")
            break
        total_height = new_height

    if stop_scraping:
        break

    print("Doing one final scroll to the absolute bottom...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    print("Waiting 3 seconds at the bottom for final loads...")
    time.sleep(3.0)

    print("Experiment: Scrolling back to top slowly...")
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(1.5, 2.5))

    mid_height = total_height // 2
    print(f"Experiment: Scrolling to middle ({mid_height}px) and pausing...")
    driver.execute_script(f"window.scrollTo(0, {mid_height});")
    time.sleep(random.uniform(2.0, 3.0))

    print("Scrolling finished for this page.")

    # --- Ambil semua link (href) dari halaman daftar ---
    href_list = []
    try:
        listings = driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")
        for listing in listings:
            try:
                href = listing.get_attribute('href')
                if href and href.startswith('https://www.rumah123.com/properti/'):
                    href_list.append(href)
            except:
                pass
        href_list = list(dict.fromkeys(href_list))
        print(f"Found {len(href_list)} property links on page {page}.")
    except NoSuchElementException:
        print(f"No more listings found on page {page}. Stopping.")
        break

    if not href_list:
        print(f"No valid listings found on page {page}. Stopping.")
        break

    # --- Loop Halaman Detail (per link) ---
    for href in href_list:
        if stop_scraping:
            break

        time.sleep(random.uniform(1, 3))
        print(f"Opening detail page: {href}")

        try:
            # Coba buka halaman
            driver.get(href)

        except TimeoutException:
            # Jika halaman gagal dimuat dalam 60 detik
            print(f"!!! PAGE TIMEOUT: Halaman {href} terlalu lama dimuat. Melewati...")
            # 'continue' akan mengabaikan sisa kode di loop ini
            # dan langsung lanjut ke 'href' berikutnya
            continue

        except Exception as e:
            # Menangkap error lain jika terjadi
            print(f"!!! ERROR LAIN saat membuka {href}: {e}. Melewati...")
            continue

        # Kode ini hanya akan berjalan JIKA 'driver.get()' BERHASIL
        time.sleep(random.uniform(3, 6))
        # ... (sisa kode Anda untuk cek CAPTCHA dan extract data) ...
        # Cek VISIBLE CAPTCHA di halaman detail
        captcha_detected = False
        try:
            captcha_element_detail = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
            if captcha_element_detail.is_displayed():
                captcha_detected = True
        except NoSuchElementException:
            pass

        if captcha_detected:
            print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
            print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(20)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Tambahkan scroll sedikit pada page detail (2-3 kali)
        detail_scroll_count = random.randint(2, 3)
        for i in range(detail_scroll_count):
            scroll_amount = random.randint(200, 400)
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            time.sleep(random.uniform(0.5, 1.0))

        # Tambahkan waktu sedikit untuk memastikan seluruh nilai muncul
        time.sleep(random.uniform(2, 3))

        # Coba klik tombol "Muat lebih banyak" (jika ada)
        expand_clicked = False
        try:
            expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
            expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
            ActionChains(driver).move_to_element(expand_button).perform()
            time.sleep(random.uniform(0.5, 1))
            expand_button.click()
            time.sleep(random.uniform(1, 2))
            print("Successfully clicked 'Muat lebih banyak' button.")
            expand_clicked = True
        except Exception as e:
            print(f"No 'Muat lebih banyak' button found or click failed. Proceeding without expansion.")

        # --- Blok Ekstraksi Data (DIPERBAIKI: Tambah try-except lengkap) ---
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            data = {}
            try:
                data['Product_sku_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(4) p.text-sm").text
            except:
                data['Product_sku_0'] = ''

            try:
                data['Place_name_0'] = driver.find_element(By.TAG_NAME, "h1").text
            except:
                data['Place_name_0'] = ''

            try:
                data['Total_Price_0'] = driver.find_element(By.CSS_SELECTOR, "span.text-primary").text
            except:
                data['Total_Price_0'] = ''

            try:
                data['Price_per_meter_0'] = driver.find_element(By.CSS_SELECTOR, "span.font-normal.text-sm").text
            except:
                data['Price_per_meter_0'] = ''

            try:
                data['Land_Area_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex-col:nth-of-type(1) p.text-gray-800").text
            except:
                data['Land_Area_0'] = ''

            try:
                data['Certificate_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.text-base").text
            except:
                data['Certificate_0'] = ''

            try:
                data['Land_Dimensions_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(1) p.font-medium.text-sm").text
            except:
                data['Land_Dimensions_0'] = ''

            try:
                data['Property_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.font-medium.text-sm").text
            except:
                data['Property_Type_0'] = ''

            try:
                data['Ad_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.items-center:nth-of-type(3) p.text-sm").text
            except:
                data['Ad_Type_0'] = ''

            try:
                data['Place_PostalAddress_addressLocality_0'] = driver.find_element(By.CSS_SELECTOR, "p.mb-2").text
            except:
                data['Place_PostalAddress_addressLocality_0'] = ''

            try:
                data['BreadcrumbList_ListItem_name_2'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(4) a").text
            except:
                data['BreadcrumbList_ListItem_name_2'] = ''

            try:
                data['BreadcrumbList_ListItem_name_3'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(5) a").text
            except:
                data['BreadcrumbList_ListItem_name_3'] = ''

            try:
                data['Product_description_0'] = driver.find_element(By.CSS_SELECTOR, "p.font-light.text-sm").text
            except:
                data['Product_description_0'] = ''

            # Peningkatan 1: Cek jika CAPTCHA terdeteksi berdasarkan 'Place_name_0'
            if data['Place_name_0'] == 'www.rumah123.com':
                print(f"!!! CAPTCHA DETECTED via data check on detail page {href}. !!!")
                print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
                print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
                time.sleep(20)
                print("Waktu jeda selesai, reloading detail page...")
                driver.get(href)
                time.sleep(random.uniform(3, 6))
                retry_count += 1
                continue  # Ekstrak ulang
            else:
                break  # Sukses, lanjut

        if retry_count == max_retries:
            print(f"Max retries reached for {href}. Proceeding with possibly incomplete data.")

        # Peningkatan 2: Koreksi inkonsistensi jika expand gagal
        if not expand_clicked:
            # Shift values based on observed mismatch
            temp_sku = data['Ad_Type_0']
            temp_ad_type = data['Property_Type_0']
            temp_property_type = data['Land_Dimensions_0']
            # Assume Land_Dimensions not available or parse from description if needed
            data['Product_sku_0'] = temp_sku
            data['Ad_Type_0'] = temp_ad_type
            data['Property_Type_0'] = temp_property_type
            data['Land_Dimensions_0'] = ''  # Or extract from description if possible

        # Print data untuk debug
        print(f"Extracted data: {data}")

        data_list.append(data)

        # Simpan data ke CSV secara bertahap
        df = pd.DataFrame(data_list)
        df.to_csv(csv_file, index=False)
        print(f"Your data saved to: {csv_file}")
        print(f"Total items saved: {len(data_list)}\n")

        # Jeda sebelum kembali ke halaman daftar
        time.sleep(random.uniform(1, 3))
        print(f"Going back to listing page {page}...")
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    # Jeda antar halaman
    print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
    time.sleep(10)

# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")
"""

Research by @Reza Anggoro
Property @scraping V1.2.1
Use it for Educational Purposes only!

This script uses Chromium Browser to crawl data from Rumah123.
Note: This script is configured to run on your local device.

Opening rumah123 search page...

Note: A Stop button window will appear. Click it to stop scraping and save data.

Starting local Chrome driver...
Driver started successfully.
Opening page 1: https://www.rumah123.com/jual/dki-jakarta/tanah/?page=1
Waiting for page layout to settle before scrolling...
Starting scroll (slower and deeper)...
-- Scrolling... (1), waiting for content...
-- Scrolling... (2), waiting for content...
-- Page height not changing, likely at bottom.
Doing one final scroll to the absolute bottom...
Waiting 3 seconds at the bottom for final loads...
Experiment: Scrolling back to top slowly...
Experiment: Scrolling to middle (10540px) and pausing...
Scrolling finished for this page.
Found 20 property links on page 1.
Opening detail page: https://www.rumah

In [None]:
"""
#V1.2
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os
import threading
import tkinter as tk
from tkinter import messagebox

# --- Pesan Perkenalan ---
print("Research by @Reza Anggoro")
print("Property @scraping V1.2")
print("Use it for Educational Purposes only!")
print("\nThis script uses Chromium Browser to crawl data from Rumah123.")
print("Note: This script is configured to run on your local device.")
print("\nOpening rumah123 search page...\n")
print("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")

# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # Membuka browser fullscreen
#driver.set_page_load_timeout(60)
# --- Menambahkan User Agent Acak ---
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument(f'user-agent={random.choice(user_agents)}')

# --- Inisialisasi Driver Chrome ---
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
print("Driver started successfully.")

# List untuk menyimpan data hasil scrape
data_list = []

# Path file CSV
csv_file = './scraped_land_data.csv'

# --- Pengecekan File CSV yang Sudah Ada ---
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        exit()

# URL dasar
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Rentang halaman yang akan di-scrape
start_page = 88
end_page = 120

# Flag untuk stop scraping
stop_scraping = False

# Fungsi untuk window tombol stop (jalan di thread terpisah)
def stop_button_window():
    global stop_scraping
    root = tk.Tk()
    root.title("Scraping Control")
    root.geometry("200x100")

    def on_stop():
        global stop_scraping
        stop_scraping = True
        messagebox.showinfo("Stop", "Scraping will stop after current operation.")
        root.destroy()

    button = tk.Button(root, text="Stop Scraping", command=on_stop)
    button.pack(pady=20)
    root.mainloop()

# Jalankan window stop di thread background
threading.Thread(target=stop_button_window, daemon=True).start()

# --- Loop Halaman Utama (PAGINATION) ---
for page in range(start_page, end_page + 1):
    if stop_scraping:
        break

    url = base_url + str(page)
    print(f"Opening page {page}: {url}")
    driver.get(url)

    # Jeda acak untuk memuat halaman
    time.sleep(random.uniform(3, 5))

    # --- Pengecekan CAPTCHA yang Terlihat ---
    captcha_detected = False
    try:
        captcha_element = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
        if captcha_element.is_displayed():
            captcha_detected = True
    except NoSuchElementException:
        pass

    if captcha_detected:
        print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
        print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
        print("!!! Skrip akan menjeda selama 25 DETIK agar Anda bisa menyelesaikannya. !!!")
        time.sleep(25)
        print("Waktu jeda selesai, melanjutkan skrip...")

    # --- Blok untuk Scrolling ---
    print("Waiting for page layout to settle before scrolling...")
    time.sleep(1.5)

    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    scroll_count = 1
    max_scroll_attempts = 50
    print("Starting scroll (slower and deeper)...")

    while current_scroll < total_height and scroll_count < max_scroll_attempts:
        if stop_scraping:
            break

        scroll_amount = random.randint(400, 700)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        print(f"-- Scrolling... ({scroll_count}), waiting for content...")
        time.sleep(random.uniform(1.5, 2.2))

        current_scroll += scroll_amount
        scroll_count += 1

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == total_height:
            print("-- Page height not changing, likely at bottom.")
            break
        total_height = new_height

    if stop_scraping:
        break

    print("Doing one final scroll to the absolute bottom...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    print("Waiting 3 seconds at the bottom for final loads...")
    time.sleep(3.0)

    print("Experiment: Scrolling back to top slowly...")
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(1.5, 2.5))

    mid_height = total_height // 2
    print(f"Experiment: Scrolling to middle ({mid_height}px) and pausing...")
    driver.execute_script(f"window.scrollTo(0, {mid_height});")
    time.sleep(random.uniform(2.0, 3.0))

    print("Scrolling finished for this page.")

    # --- Ambil semua link (href) dari halaman daftar ---
    href_list = []
    try:
        listings = driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")
        for listing in listings:
            try:
                href = listing.get_attribute('href')
                if href and href.startswith('https://www.rumah123.com/properti/'):
                    href_list.append(href)
            except:
                pass
        href_list = list(dict.fromkeys(href_list))
        print(f"Found {len(href_list)} property links on page {page}.")
    except NoSuchElementException:
        print(f"No more listings found on page {page}. Stopping.")
        break

    if not href_list:
        print(f"No valid listings found on page {page}. Stopping.")
        break

    # --- Loop Halaman Detail (per link) ---
    for href in href_list:
        if stop_scraping:
            break

        time.sleep(random.uniform(1, 3))
        print(f"Opening detail page: {href}")

        try:
            # Coba buka halaman
            driver.get(href)

        except TimeoutException:
            # Jika halaman gagal dimuat dalam 60 detik
            print(f"!!! PAGE TIMEOUT: Halaman {href} terlalu lama dimuat. Melewati...")
            # 'continue' akan mengabaikan sisa kode di loop ini
            # dan langsung lanjut ke 'href' berikutnya
            continue

        except Exception as e:
            # Menangkap error lain jika terjadi
            print(f"!!! ERROR LAIN saat membuka {href}: {e}. Melewati...")
            continue

        # Kode ini hanya akan berjalan JIKA 'driver.get()' BERHASIL
        time.sleep(random.uniform(3, 6))
        # ... (sisa kode Anda untuk cek CAPTCHA dan extract data) ...
        # Cek VISIBLE CAPTCHA di halaman detail
        captcha_detected = False
        try:
            captcha_element_detail = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
            if captcha_element_detail.is_displayed():
                captcha_detected = True
        except NoSuchElementException:
            pass

        if captcha_detected:
            print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
            print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(20)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Tambahkan scroll sedikit pada page detail (2-3 kali)
        detail_scroll_count = random.randint(2, 3)
        for i in range(detail_scroll_count):
            scroll_amount = random.randint(200, 400)
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            time.sleep(random.uniform(0.5, 1.0))

        # Tambahkan waktu sedikit untuk memastikan seluruh nilai muncul
        time.sleep(random.uniform(2, 3))

        # Coba klik tombol "Muat lebih banyak" (jika ada)
        expand_clicked = False
        try:
            expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
            expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
            ActionChains(driver).move_to_element(expand_button).perform()
            time.sleep(random.uniform(0.5, 1))
            expand_button.click()
            time.sleep(random.uniform(1, 2))
            print("Successfully clicked 'Muat lebih banyak' button.")
            expand_clicked = True
        except Exception as e:
            print(f"No 'Muat lebih banyak' button found or click failed. Proceeding without expansion.")

        # --- Blok Ekstraksi Data (DIPERBAIKI: Tambah try-except lengkap) ---
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            data = {}
            try:
                data['Product_sku_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(4) p.text-sm").text
            except:
                data['Product_sku_0'] = ''

            try:
                data['Place_name_0'] = driver.find_element(By.TAG_NAME, "h1").text
            except:
                data['Place_name_0'] = ''

            try:
                data['Total_Price_0'] = driver.find_element(By.CSS_SELECTOR, "span.text-primary").text
            except:
                data['Total_Price_0'] = ''

            try:
                data['Price_per_meter_0'] = driver.find_element(By.CSS_SELECTOR, "span.font-normal.text-sm").text
            except:
                data['Price_per_meter_0'] = ''

            try:
                data['Land_Area_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex-col:nth-of-type(1) p.text-gray-800").text
            except:
                data['Land_Area_0'] = ''

            try:
                data['Certificate_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.text-base").text
            except:
                data['Certificate_0'] = ''

            try:
                data['Land_Dimensions_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(1) p.font-medium.text-sm").text
            except:
                data['Land_Dimensions_0'] = ''

            try:
                data['Property_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.font-medium.text-sm").text
            except:
                data['Property_Type_0'] = ''

            try:
                data['Ad_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.items-center:nth-of-type(3) p.text-sm").text
            except:
                data['Ad_Type_0'] = ''

            try:
                data['Place_PostalAddress_addressLocality_0'] = driver.find_element(By.CSS_SELECTOR, "p.mb-2").text
            except:
                data['Place_PostalAddress_addressLocality_0'] = ''

            try:
                data['BreadcrumbList_ListItem_name_2'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(4) a").text
            except:
                data['BreadcrumbList_ListItem_name_2'] = ''

            try:
                data['BreadcrumbList_ListItem_name_3'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(5) a").text
            except:
                data['BreadcrumbList_ListItem_name_3'] = ''

            try:
                data['Product_description_0'] = driver.find_element(By.CSS_SELECTOR, "p.font-light.text-sm").text
            except:
                data['Product_description_0'] = ''

            # Peningkatan 1: Cek jika CAPTCHA terdeteksi berdasarkan 'Place_name_0'
            if data['Place_name_0'] == 'www.rumah123.com':
                print(f"!!! CAPTCHA DETECTED via data check on detail page {href}. !!!")
                print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
                print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
                time.sleep(20)
                print("Waktu jeda selesai, reloading detail page...")
                driver.get(href)
                time.sleep(random.uniform(3, 6))
                retry_count += 1
                continue  # Ekstrak ulang
            else:
                break  # Sukses, lanjut

        if retry_count == max_retries:
            print(f"Max retries reached for {href}. Proceeding with possibly incomplete data.")

        # Peningkatan 2: Koreksi inkonsistensi jika expand gagal
        if not expand_clicked:
            # Shift values based on observed mismatch
            temp_sku = data['Ad_Type_0']
            temp_ad_type = data['Property_Type_0']
            temp_property_type = data['Land_Dimensions_0']
            # Assume Land_Dimensions not available or parse from description if needed
            data['Product_sku_0'] = temp_sku
            data['Ad_Type_0'] = temp_ad_type
            data['Property_Type_0'] = temp_property_type
            data['Land_Dimensions_0'] = ''  # Or extract from description if possible

        # Print data untuk debug
        print(f"Extracted data: {data}")

        data_list.append(data)

        # Simpan data ke CSV secara bertahap
        df = pd.DataFrame(data_list)
        df.to_csv(csv_file, index=False)
        print(f"Your data saved to: {csv_file}")
        print(f"Total items saved: {len(data_list)}\n")

        # Jeda sebelum kembali ke halaman daftar
        time.sleep(random.uniform(1, 3))
        print(f"Going back to listing page {page}...")
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    # Jeda antar halaman
    print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
    time.sleep(10)

# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")
"""

In [None]:
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os
import threading
import tkinter as tk
from tkinter import messagebox

# --- Pesan Perkenalan ---
print("Research by @Reza Anggoro")
print("Property @scraping V1.3")
print("Use it for Educational Purposes only!")
print("\nThis script uses Chromium Browser to crawl data from Rumah123.")
print("Note: This script is configured to run on your local device.")
print("\nOpening rumah123 search page...\n")
print("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")

# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # Membuka browser fullscreen
#driver.set_page_load_timeout(60)
# --- Menambahkan User Agent Acak ---
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument(f'user-agent={random.choice(user_agents)}')

# --- Inisialisasi Driver Chrome ---
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
print("Driver started successfully.")

# List untuk menyimpan data hasil scrape
data_list = []

# Path file CSV
csv_file = './scraped_land_data.csv'

# --- Pengecekan File CSV yang Sudah Ada ---
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        exit()

# URL dasar
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Rentang halaman yang akan di-scrape
start_page = 104
end_page = 750

# Flag untuk stop scraping
stop_scraping = False

# Fungsi untuk window tombol stop (jalan di thread terpisah)
def stop_button_window():
    global stop_scraping
    root = tk.Tk()
    root.title("Scraping Control")
    root.geometry("200x100")

    def on_stop():
        global stop_scraping
        stop_scraping = True
        messagebox.showinfo("Stop", "Scraping will stop after current operation.")
        root.destroy()

    button = tk.Button(root, text="Stop Scraping", command=on_stop)
    button.pack(pady=20)
    root.mainloop()

# Jalankan window stop di thread background
threading.Thread(target=stop_button_window, daemon=True).start()

# --- Loop Halaman Utama (PAGINATION) ---
for page in range(start_page, end_page + 1):
    if stop_scraping:
        break

    url = base_url + str(page)
    print(f"Opening page {page}: {url}")
    driver.get(url)

    # Jeda acak untuk memuat halaman
    time.sleep(random.uniform(5, 7))

    # --- Pengecekan CAPTCHA yang Terlihat ---
    captcha_detected = False
    try:
        captcha_element = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
        if captcha_element.is_displayed():
            captcha_detected = True
    except NoSuchElementException:
        pass

    if captcha_detected:
        print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
        print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
        print("!!! Skrip akan menjeda selama 25 DETIK agar Anda bisa menyelesaikannya. !!!")
        time.sleep(25)
        print("Waktu jeda selesai, melanjutkan skrip...")

    # --- Blok untuk Scrolling ---
    print("Waiting for page layout to settle before scrolling...")
    time.sleep(1.5)

    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    scroll_count = 1
    max_scroll_attempts = 50
    print("Starting scroll (slower and deeper)...")

    while current_scroll < total_height and scroll_count < max_scroll_attempts:
        if stop_scraping:
            break

        scroll_amount = random.randint(400, 700)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        print(f"-- Scrolling... ({scroll_count}), waiting for content...")
        time.sleep(random.uniform(1.5, 2.2))

        current_scroll += scroll_amount
        scroll_count += 1

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == total_height:
            print("-- Page height not changing, likely at bottom.")
            break
        total_height = new_height

    if stop_scraping:
        break

    print("Doing one final scroll to the absolute bottom...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    print("Waiting 3 seconds at the bottom for final loads...")
    time.sleep(3.0)

    print("Experiment: Scrolling back to top slowly...")
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(1.5, 2.5))

    mid_height = total_height // 2
    print(f"Experiment: Scrolling to middle ({mid_height}px) and pausing...")
    driver.execute_script(f"window.scrollTo(0, {mid_height});")
    time.sleep(random.uniform(2.0, 3.0))

    print("Scrolling finished for this page.")

    # --- Ambil semua link (href) dari halaman daftar ---
    href_list = []
    try:
        listings = driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")
        for listing in listings:
            try:
                href = listing.get_attribute('href')
                if href and href.startswith('https://www.rumah123.com/properti/'):
                    href_list.append(href)
            except:
                pass
        href_list = list(dict.fromkeys(href_list))
        print(f"Found {len(href_list)} property links on page {page}.")
    except NoSuchElementException:
        print(f"No more listings found on page {page}. Stopping.")
        break

    if not href_list:
        print(f"No valid listings found on page {page}. Stopping.")
        break

    # --- Loop Halaman Detail (per link) ---
    for href in href_list:
        if stop_scraping:
            break

        time.sleep(random.uniform(1, 3))
        print(f"Opening detail page: {href}")

        try:
            # Coba buka halaman
            driver.get(href)

        except TimeoutException:
            # Jika halaman gagal dimuat dalam 60 detik
            print(f"!!! PAGE TIMEOUT: Halaman {href} terlalu lama dimuat. Melewati...")
            # 'continue' akan mengabaikan sisa kode di loop ini
            # dan langsung lanjut ke 'href' berikutnya
            continue

        except Exception as e:
            # Menangkap error lain jika terjadi
            print(f"!!! ERROR LAIN saat membuka {href}: {e}. Melewati...")
            continue

        # Kode ini hanya akan berjalan JIKA 'driver.get()' BERHASIL
        time.sleep(random.uniform(3, 6))
        # ... (sisa kode Anda untuk cek CAPTCHA dan extract data) ...
        # Cek VISIBLE CAPTCHA di halaman detail
        captcha_detected = False
        try:
            captcha_element_detail = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
            if captcha_element_detail.is_displayed():
                captcha_detected = True
        except NoSuchElementException:
            pass

        if captcha_detected:
            print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
            print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(20)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Tambahkan scroll sedikit pada page detail (2-3 kali)
        detail_scroll_count = random.randint(1, 2)
        for i in range(detail_scroll_count):
            scroll_amount = random.randint(200, 400)
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            time.sleep(random.uniform(0.5, 1.0))

        # Tambahkan waktu sedikit untuk memastikan seluruh nilai muncul
        time.sleep(random.uniform(2, 3))

        # Coba klik tombol "Muat lebih banyak" (jika ada)
        expand_clicked = False
        try:
            expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
            expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
            ActionChains(driver).move_to_element(expand_button).perform()
            time.sleep(random.uniform(0.5, 1))
            expand_button.click()
            time.sleep(random.uniform(1, 2))
            print("Successfully clicked 'Muat lebih banyak' button.")
            expand_clicked = True
        except Exception as e:
            print(f"No 'Muat lebih banyak' button found or click failed. Proceeding without expansion.")

        # --- Blok Ekstraksi Data (DIPERBAIKI: Tambah try-except lengkap) ---
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            data = {}
            try:
                data['Product_sku_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(4) p.text-sm").text
            except:
                data['Product_sku_0'] = ''

            try:
                data['Place_name_0'] = driver.find_element(By.TAG_NAME, "h1").text
            except:
                data['Place_name_0'] = ''

            try:
                data['Total_Price_0'] = driver.find_element(By.CSS_SELECTOR, "span.text-primary").text
            except:
                data['Total_Price_0'] = ''

            try:
                data['Price_per_meter_0'] = driver.find_element(By.CSS_SELECTOR, "span.font-normal.text-sm").text
            except:
                data['Price_per_meter_0'] = ''

            try:
                data['Land_Area_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex-col:nth-of-type(1) p.text-gray-800").text
            except:
                data['Land_Area_0'] = ''

            try:
                data['Certificate_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.text-base").text
            except:
                data['Certificate_0'] = ''

            try:
                data['Land_Dimensions_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(1) p.font-medium.text-sm").text
            except:
                data['Land_Dimensions_0'] = ''

            try:
                data['Property_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.font-medium.text-sm").text
            except:
                data['Property_Type_0'] = ''

            try:
                data['Ad_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.items-center:nth-of-type(3) p.text-sm").text
            except:
                data['Ad_Type_0'] = ''

            try:
                data['Place_PostalAddress_addressLocality_0'] = driver.find_element(By.CSS_SELECTOR, "p.mb-2").text
            except:
                data['Place_PostalAddress_addressLocality_0'] = ''

            try:
                data['BreadcrumbList_ListItem_name_2'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(4) a").text
            except:
                data['BreadcrumbList_ListItem_name_2'] = ''

            try:
                data['BreadcrumbList_ListItem_name_3'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(5) a").text
            except:
                data['BreadcrumbList_ListItem_name_3'] = ''

            try:
                data['Product_description_0'] = driver.find_element(By.CSS_SELECTOR, "p.font-light.text-sm").text
            except:
                data['Product_description_0'] = ''

            # Peningkatan 1: Cek jika CAPTCHA terdeteksi berdasarkan 'Place_name_0'
            if data['Place_name_0'] == 'www.rumah123.com':
                print(f"!!! CAPTCHA DETECTED via data check on detail page {href}. !!!")
                print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
                print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
                time.sleep(20)
                print("Waktu jeda selesai, reloading detail page...")
                driver.get(href)
                time.sleep(random.uniform(3, 6))
                retry_count += 1
                continue  # Ekstrak ulang
            else:
                break  # Sukses, lanjut

        if retry_count == max_retries:
            print(f"Max retries reached for {href}. Proceeding with possibly incomplete data.")

        # Peningkatan 2: Koreksi inkonsistensi jika expand gagal
        if not expand_clicked:
            # Shift values based on observed mismatch
            temp_sku = data['Ad_Type_0']
            temp_ad_type = data['Property_Type_0']
            temp_property_type = data['Land_Dimensions_0']
            # Assume Land_Dimensions not available or parse from description if needed
            data['Product_sku_0'] = temp_sku
            data['Ad_Type_0'] = temp_ad_type
            data['Property_Type_0'] = temp_property_type
            data['Land_Dimensions_0'] = ''  # Or extract from description if possible

        # Print data untuk debug
        print(f"Extracted data: {data}")

        data_list.append(data)

        # Simpan data ke CSV secara bertahap
        df = pd.DataFrame(data_list)
        df.to_csv(csv_file, index=False)
        print(f"Your data saved to: {csv_file}")
        print(f"Total items saved: {len(data_list)}\n")

        # Jeda sebelum kembali ke halaman daftar
        time.sleep(random.uniform(1, 3))
        print(f"Going back to listing page {page}...")
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    # Jeda antar halaman
    print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
    time.sleep(10)

# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")
"""

Research by @Reza Anggoro
Property @scraping V1.2
Use it for Educational Purposes only!

This script uses Chromium Browser to crawl data from Rumah123.
Note: This script is configured to run on your local device.

Opening rumah123 search page...

Note: A Stop button window will appear. Click it to stop scraping and save data.

Starting local Chrome driver...
Driver started successfully.
Opening page 104: https://www.rumah123.com/jual/dki-jakarta/tanah/?page=104
Waiting for page layout to settle before scrolling...
Starting scroll (slower and deeper)...
-- Scrolling... (1), waiting for content...
-- Page height not changing, likely at bottom.
Doing one final scroll to the absolute bottom...
Waiting 3 seconds at the bottom for final loads...
Experiment: Scrolling back to top slowly...
Experiment: Scrolling to middle (349px) and pausing...
Scrolling finished for this page.
Found 0 property links on page 104.
No valid listings found on page 104. Stopping.
Scraping completed (or stopped). D

In [None]:
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os
import threading
import tkinter as tk
from tkinter import messagebox

# --- Pesan Perkenalan ---
print("Research by @Reza Anggoro")
print("Property @scraping V1.2")
print("Use it for Educational Purposes only!")
print("\nThis script uses Chromium Browser to crawl data from Rumah123.")
print("Note: This script is configured to run on your local device.")
print("\nOpening rumah123 search page...\n")
print("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")

# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # Membuka browser fullscreen
#driver.set_page_load_timeout(60)
# --- Menambahkan User Agent Acak ---
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument(f'user-agent={random.choice(user_agents)}')

# --- Inisialisasi Driver Chrome ---
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
print("Driver started successfully.")

# List untuk menyimpan data hasil scrape
data_list = []

# Path file CSV
csv_file = './scraped_land_data.csv'

# --- Pengecekan File CSV yang Sudah Ada ---
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        exit()

# URL dasar
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Rentang halaman yang akan di-scrape
start_page = 101
end_page = 750

# Flag untuk stop scraping
stop_scraping = False

# Fungsi untuk window tombol stop (jalan di thread terpisah)
def stop_button_window():
    global stop_scraping
    root = tk.Tk()
    root.title("Scraping Control")
    root.geometry("200x100")

    def on_stop():
        global stop_scraping
        stop_scraping = True
        messagebox.showinfo("Stop", "Scraping will stop after current operation.")
        root.destroy()

    button = tk.Button(root, text="Stop Scraping", command=on_stop)
    button.pack(pady=20)
    root.mainloop()

# Jalankan window stop di thread background
threading.Thread(target=stop_button_window, daemon=True).start()

# --- Loop Halaman Utama (PAGINATION) ---
for page in range(start_page, end_page + 1):
    if stop_scraping:
        break

    url = base_url + str(page)
    print(f"Opening page {page}: {url}")
    driver.get(url)

    # Jeda acak untuk memuat halaman
    time.sleep(random.uniform(3, 6))

    # --- Pengecekan CAPTCHA yang Terlihat ---
    captcha_detected = False
    try:
        captcha_element = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
        if captcha_element.is_displayed():
            captcha_detected = True
    except NoSuchElementException:
        pass

    if captcha_detected:
        print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
        print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
        print("!!! Skrip akan menjeda selama 25 DETIK agar Anda bisa menyelesaikannya. !!!")
        time.sleep(25)
        print("Waktu jeda selesai, melanjutkan skrip...")

    # --- Blok untuk Scrolling ---
    print("Waiting for page layout to settle before scrolling...")
    time.sleep(1.5)

    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    scroll_count = 1
    max_scroll_attempts = 50
    print("Starting scroll (slower and deeper)...")

    while current_scroll < total_height and scroll_count < max_scroll_attempts:
        if stop_scraping:
            break

        scroll_amount = random.randint(400, 700)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        print(f"-- Scrolling... ({scroll_count}), waiting for content...")
        time.sleep(random.uniform(1.5, 2.2))

        current_scroll += scroll_amount
        scroll_count += 1

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == total_height:
            print("-- Page height not changing, likely at bottom.")
            break
        total_height = new_height

    if stop_scraping:
        break

    print("Doing one final scroll to the absolute bottom...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    print("Waiting 3 seconds at the bottom for final loads...")
    time.sleep(3.0)

    print("Experiment: Scrolling back to top slowly...")
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(1.5, 2.5))

    mid_height = total_height // 2
    print(f"Experiment: Scrolling to middle ({mid_height}px) and pausing...")
    driver.execute_script(f"window.scrollTo(0, {mid_height});")
    time.sleep(random.uniform(2.0, 3.0))

    print("Scrolling finished for this page.")

    # --- Ambil semua link (href) dari halaman daftar ---
    href_list = []
    try:
        listings = driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")
        for listing in listings:
            try:
                href = listing.get_attribute('href')
                if href and href.startswith('https://www.rumah123.com/properti/'):
                    href_list.append(href)
            except:
                pass
        href_list = list(dict.fromkeys(href_list))
        print(f"Found {len(href_list)} property links on page {page}.")
    except NoSuchElementException:
        print(f"No more listings found on page {page}. Stopping.")
        break

    if not href_list:
        print(f"No valid listings found on page {page}. Stopping.")
        break

    # --- Loop Halaman Detail (per link) ---
    for href in href_list:
        if stop_scraping:
            break

        time.sleep(random.uniform(1, 3))
        print(f"Opening detail page: {href}")

        try:
            # Coba buka halaman
            driver.get(href)

        except TimeoutException:
            # Jika halaman gagal dimuat dalam 60 detik
            print(f"!!! PAGE TIMEOUT: Halaman {href} terlalu lama dimuat. Melewati...")
            # 'continue' akan mengabaikan sisa kode di loop ini
            # dan langsung lanjut ke 'href' berikutnya
            continue

        except Exception as e:
            # Menangkap error lain jika terjadi
            print(f"!!! ERROR LAIN saat membuka {href}: {e}. Melewati...")
            continue

        # Kode ini hanya akan berjalan JIKA 'driver.get()' BERHASIL
        time.sleep(random.uniform(3, 6))
        # ... (sisa kode Anda untuk cek CAPTCHA dan extract data) ...
        # Cek VISIBLE CAPTCHA di halaman detail
        captcha_detected = False
        try:
            captcha_element_detail = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
            if captcha_element_detail.is_displayed():
                captcha_detected = True
        except NoSuchElementException:
            pass

        if captcha_detected:
            print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
            print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(20)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Tambahkan scroll sedikit pada page detail (2-3 kali)
        detail_scroll_count = random.randint(2, 3)
        for i in range(detail_scroll_count):
            scroll_amount = random.randint(200, 400)
            driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
            time.sleep(random.uniform(0.5, 1.0))

        # Tambahkan waktu sedikit untuk memastikan seluruh nilai muncul
        time.sleep(random.uniform(2, 3))

        # Coba klik tombol "Muat lebih banyak" (jika ada)
        expand_clicked = False
        try:
            expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
            expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
            ActionChains(driver).move_to_element(expand_button).perform()
            time.sleep(random.uniform(0.5, 1))
            expand_button.click()
            time.sleep(random.uniform(1, 2))
            print("Successfully clicked 'Muat lebih banyak' button.")
            expand_clicked = True
        except Exception as e:
            print(f"No 'Muat lebih banyak' button found or click failed. Proceeding without expansion.")

        # --- Blok Ekstraksi Data (DIPERBAIKI: Tambah try-except lengkap) ---
        max_retries = 3
        retry_count = 0
        while retry_count < max_retries:
            data = {}
            try:
                data['Product_sku_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(4) p.text-sm").text
            except:
                data['Product_sku_0'] = ''

            try:
                data['Place_name_0'] = driver.find_element(By.TAG_NAME, "h1").text
            except:
                data['Place_name_0'] = ''

            try:
                data['Total_Price_0'] = driver.find_element(By.CSS_SELECTOR, "span.text-primary").text
            except:
                data['Total_Price_0'] = ''

            try:
                data['Price_per_meter_0'] = driver.find_element(By.CSS_SELECTOR, "span.font-normal.text-sm").text
            except:
                data['Price_per_meter_0'] = ''

            try:
                data['Land_Area_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex-col:nth-of-type(1) p.text-gray-800").text
            except:
                data['Land_Area_0'] = ''

            try:
                data['Certificate_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.text-base").text
            except:
                data['Certificate_0'] = ''

            try:
                data['Land_Dimensions_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(1) p.font-medium.text-sm").text
            except:
                data['Land_Dimensions_0'] = ''

            try:
                data['Property_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.font-medium.text-sm").text
            except:
                data['Property_Type_0'] = ''

            try:
                data['Ad_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.items-center:nth-of-type(3) p.text-sm").text
            except:
                data['Ad_Type_0'] = ''

            try:
                data['Place_PostalAddress_addressLocality_0'] = driver.find_element(By.CSS_SELECTOR, "p.mb-2").text
            except:
                data['Place_PostalAddress_addressLocality_0'] = ''

            try:
                data['BreadcrumbList_ListItem_name_2'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(4) a").text
            except:
                data['BreadcrumbList_ListItem_name_2'] = ''

            try:
                data['BreadcrumbList_ListItem_name_3'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(5) a").text
            except:
                data['BreadcrumbList_ListItem_name_3'] = ''

            try:
                data['Product_description_0'] = driver.find_element(By.CSS_SELECTOR, "p.font-light.text-sm").text
            except:
                data['Product_description_0'] = ''

            # Peningkatan 1: Cek jika CAPTCHA terdeteksi berdasarkan 'Place_name_0'
            if data['Place_name_0'] == 'www.rumah123.com':
                print(f"!!! CAPTCHA DETECTED via data check on detail page {href}. !!!")
                print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
                print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
                time.sleep(20)
                print("Waktu jeda selesai, reloading detail page...")
                driver.get(href)
                time.sleep(random.uniform(3, 6))
                retry_count += 1
                continue  # Ekstrak ulang
            else:
                break  # Sukses, lanjut

        if retry_count == max_retries:
            print(f"Max retries reached for {href}. Proceeding with possibly incomplete data.")

        # Peningkatan 2: Koreksi inkonsistensi jika expand gagal
        if not expand_clicked:
            # Shift values based on observed mismatch
            temp_sku = data['Ad_Type_0']
            temp_ad_type = data['Property_Type_0']
            temp_property_type = data['Land_Dimensions_0']
            # Assume Land_Dimensions not available or parse from description if needed
            data['Product_sku_0'] = temp_sku
            data['Ad_Type_0'] = temp_ad_type
            data['Property_Type_0'] = temp_property_type
            data['Land_Dimensions_0'] = ''  # Or extract from description if possible

        # Print data untuk debug
        print(f"Extracted data: {data}")

        data_list.append(data)

        # Simpan data ke CSV secara bertahap
        df = pd.DataFrame(data_list)
        df.to_csv(csv_file, index=False)
        print(f"Your data saved to: {csv_file}")
        print(f"Total items saved: {len(data_list)}\n")

        # Jeda sebelum kembali ke halaman daftar
        time.sleep(random.uniform(1, 3))
        print(f"Going back to listing page {page}...")
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    # Jeda antar halaman
    print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
    time.sleep(10)

# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")
"""

Research by @Reza Anggoro
Property @scraping V1.2
Use it for Educational Purposes only!

This script uses Chromium Browser to crawl data from Rumah123.
Note: This script is configured to run on your local device.

Opening rumah123 search page...

Note: A Stop button window will appear. Click it to stop scraping and save data.

Starting local Chrome driver...
Driver started successfully.
Opening page 101: https://www.rumah123.com/jual/dki-jakarta/tanah/?page=101
Waiting for page layout to settle before scrolling...
Starting scroll (slower and deeper)...
-- Scrolling... (1), waiting for content...
-- Scrolling... (2), waiting for content...
-- Page height not changing, likely at bottom.
Doing one final scroll to the absolute bottom...
Waiting 3 seconds at the bottom for final loads...
Experiment: Scrolling back to top slowly...
Experiment: Scrolling to middle (9659px) and pausing...
Scrolling finished for this page.
Found 15 property links on page 101.
Opening detail page: https://www.ru

OLD Version V1

In [None]:
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os
import threading
import tkinter as tk
from tkinter import messagebox

# --- Pesan Perkenalan ---
print("Research by @Reza Anggoro")
print("Property @scraping V1.0")
print("Use it for Educational Purposes only!")
print("\nThis script uses Chromium Browser to crawl data from Rumah123.")
print("Note: This script is configured to run on your local device.")
print("\nOpening rumah123 search page...\n")
print("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")

# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # Membuka browser fullscreen

# --- Menambahkan User Agent Acak ---
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument(f'user-agent={random.choice(user_agents)}')

# --- Inisialisasi Driver Chrome ---
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
print("Driver started successfully.")

# List untuk menyimpan data hasil scrape
data_list = []

# Path file CSV
csv_file = './scraped_land_data.csv'

# --- Pengecekan File CSV yang Sudah Ada ---
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        exit()

# URL dasar
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Rentang halaman yang akan di-scrape
start_page = 81
end_page = 120

# Flag untuk stop scraping
stop_scraping = False

# Fungsi untuk window tombol stop (jalan di thread terpisah)
def stop_button_window():
    global stop_scraping
    root = tk.Tk()
    root.title("Scraping Control")
    root.geometry("200x100")

    def on_stop():
        global stop_scraping
        stop_scraping = True
        messagebox.showinfo("Stop", "Scraping will stop after current operation.")
        root.destroy()

    button = tk.Button(root, text="Stop Scraping", command=on_stop)
    button.pack(pady=20)
    root.mainloop()

# Jalankan window stop di thread background
threading.Thread(target=stop_button_window, daemon=True).start()

# --- Loop Halaman Utama (PAGINATION) ---
for page in range(start_page, end_page + 1):
    if stop_scraping:
        break

    url = base_url + str(page)
    print(f"Opening page {page}: {url}")
    driver.get(url)

    # Jeda acak untuk memuat halaman
    time.sleep(random.uniform(3, 6))

    # --- Pengecekan CAPTCHA yang Terlihat ---
    captcha_detected = False
    try:
        captcha_element = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
        if captcha_element.is_displayed():
            captcha_detected = True
    except NoSuchElementException:
        pass

    if captcha_detected:
        print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
        print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
        print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
        time.sleep(20)
        print("Waktu jeda selesai, melanjutkan skrip...")

    # --- Blok untuk Scrolling ---
    print("Waiting for page layout to settle before scrolling...")
    time.sleep(1.5)

    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    scroll_count = 1
    max_scroll_attempts = 50
    print("Starting scroll (slower and deeper)...")

    while current_scroll < total_height and scroll_count < max_scroll_attempts:
        if stop_scraping:
            break

        scroll_amount = random.randint(400, 700)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")

        print(f"-- Scrolling... ({scroll_count}), waiting for content...")
        time.sleep(random.uniform(1.5, 2.2))

        current_scroll += scroll_amount
        scroll_count += 1

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == total_height:
            print("-- Page height not changing, likely at bottom.")
            break
        total_height = new_height

    if stop_scraping:
        break

    print("Doing one final scroll to the absolute bottom...")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    print("Waiting 3 seconds at the bottom for final loads...")
    time.sleep(3.0)

    print("Experiment: Scrolling back to top slowly...")
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(1.5, 2.5))

    mid_height = total_height // 2
    print(f"Experiment: Scrolling to middle ({mid_height}px) and pausing...")
    driver.execute_script(f"window.scrollTo(0, {mid_height});")
    time.sleep(random.uniform(2.0, 3.0))

    print("Scrolling finished for this page.")

    # --- Ambil semua link (href) dari halaman daftar ---
    href_list = []
    try:
        listings = driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full")
        for listing in listings:
            try:
                href = listing.get_attribute('href')
                if href and href.startswith('https://www.rumah123.com/properti/'):
                    href_list.append(href)
            except:
                pass
        href_list = list(dict.fromkeys(href_list))
        print(f"Found {len(href_list)} property links on page {page}.")
    except NoSuchElementException:
        print(f"No more listings found on page {page}. Stopping.")
        break

    if not href_list:
        print(f"No valid listings found on page {page}. Stopping.")
        break

    # --- Loop Halaman Detail (per link) ---
    for href in href_list:
        if stop_scraping:
            break

        time.sleep(random.uniform(1, 3))
        print(f"Opening detail page: {href}")
        driver.get(href)
        time.sleep(random.uniform(3, 6))

        # Cek VISIBLE CAPTCHA di halaman detail
        captcha_detected = False
        try:
            captcha_element_detail = driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
            if captcha_element_detail.is_displayed():
                captcha_detected = True
        except NoSuchElementException:
            pass

        if captcha_detected:
            print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
            print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(20)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Coba klik tombol "Muat lebih banyak" (jika ada)
        try:
            expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
            expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
            ActionChains(driver).move_to_element(expand_button).perform()
            time.sleep(random.uniform(0.5, 1))
            expand_button.click()
            time.sleep(random.uniform(1, 2))
            print("Successfully clicked 'Muat lebih banyak' button.")
        except Exception as e:
            print(f"No 'Muat lebih banyak' button found or click failed. Proceeding without expansion.")

        # --- Blok Ekstraksi Data (DIPERBAIKI: Tambah try-except lengkap) ---
        data = {}
        try:
            data['Product_sku_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(4) p.text-sm").text
        except:
            data['Product_sku_0'] = ''

        try:
            data['Place_name_0'] = driver.find_element(By.TAG_NAME, "h1").text
        except:
            data['Place_name_0'] = ''

        try:
            data['Total_Price_0'] = driver.find_element(By.CSS_SELECTOR, "span.text-primary").text
        except:
            data['Total_Price_0'] = ''

        try:
            data['Price_per_meter_0'] = driver.find_element(By.CSS_SELECTOR, "span.font-normal.text-sm").text
        except:
            data['Price_per_meter_0'] = ''

        try:
            data['Land_Area_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex-col:nth-of-type(1) p.text-gray-800").text
        except:
            data['Land_Area_0'] = ''

        try:
            data['Certificate_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.text-base").text
        except:
            data['Certificate_0'] = ''

        try:
            data['Land_Dimensions_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(1) p.font-medium.text-sm").text
        except:
            data['Land_Dimensions_0'] = ''

        try:
            data['Property_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.flex:nth-of-type(2) p.font-medium.text-sm").text
        except:
            data['Property_Type_0'] = ''

        try:
            data['Ad_Type_0'] = driver.find_element(By.CSS_SELECTOR, "div.items-center:nth-of-type(3) p.text-sm").text
        except:
            data['Ad_Type_0'] = ''

        try:
            data['Place_PostalAddress_addressLocality_0'] = driver.find_element(By.CSS_SELECTOR, "p.mb-2").text
        except:
            data['Place_PostalAddress_addressLocality_0'] = ''

        try:
            data['BreadcrumbList_ListItem_name_2'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(4) a").text
        except:
            data['BreadcrumbList_ListItem_name_2'] = ''

        try:
            data['BreadcrumbList_ListItem_name_3'] = driver.find_element(By.CSS_SELECTOR, ".flex div:nth-of-type(5) a").text
        except:
            data['BreadcrumbList_ListItem_name_3'] = ''

        try:
            data['Product_description_0'] = driver.find_element(By.CSS_SELECTOR, "p.font-light.text-sm").text
        except:
            data['Product_description_0'] = ''

        # Print data untuk debug
        print(f"Extracted data: {data}")

        data_list.append(data)

        # Simpan data ke CSV secara bertahap
        df = pd.DataFrame(data_list)
        df.to_csv(csv_file, index=False)
        print(f"Your data saved to: {csv_file}")
        print(f"Total items saved: {len(data_list)}\n")

        # Jeda sebelum kembali ke halaman daftar
        time.sleep(random.uniform(1, 3))
        print(f"Going back to listing page {page}...")
        driver.get(url)
        time.sleep(random.uniform(2, 4))

    # Jeda antar halaman
    print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
    time.sleep(10)

# --- Selesai ---
driver.quit()

# Simpan data terakhir jika belum
if data_list:
    df = pd.DataFrame(data_list)
    df.to_csv(csv_file, index=False)
    print(f"Final data saved to: {csv_file}")
    print(f"Total items saved: {len(data_list)}")

print("Scraping completed (or stopped). Data saved to 'scraped_land_data.csv'.")
"""

'\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom selenium.webdriver.common.action_chains import ActionChains\nfrom selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException\nimport time\nimport random\nimport pandas as pd\nimport os\nimport threading\nimport tkinter as tk\nfrom tkinter import messagebox\n\n# --- Pesan Perkenalan ---\nprint("Research by @Reza Anggoro")\nprint("Property @scraping V1.0")\nprint("Use it for Educational Purposes only!")\nprint("\nThis script uses Chromium Browser to crawl data from Rumah123.")\nprint("Note: This script is configured to run on your local device.")\nprint("\nOpening rumah123 search page...\n")\nprint("Note: A Stop button window will appear. Click it to stop scraping and save data.\n")\n\n# --- Pengaturan Opsi Chrome untuk LOCAL (NON-HEADLESS) ---\noptions = webdriver.ChromeOptions()\noptions.add_argument("start-maximized")  # Membuka browser fullscreen\n\n# --- Menambahkan U