<a href="https://colab.research.google.com/github/anggaangoro3/data-science-project/blob/main/Scrping_Rumah123_Selenium.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Chromium and the matching Chromedriver through apt (requires a Debian/Ubuntu-style host).
!apt-get update
!apt-get install -y chromium-browser chromium-chromedriver
# Copy the driver binary into /usr/bin, presumably so it is found on PATH.
# NOTE(review): assumes the package put chromedriver under /usr/lib/chromium-browser — confirm on the target image.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin/chromedriver

# This message is printed unconditionally; the shell commands above are not checked for failure.
print("Chrome and Chromedriver installed and configured.")

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import random
import pandas as pd
import os

# Startup banner: each argument ends up on its own output line.
print(
    "Research by Grok",
    "Use it for Educational Purposes only!",
    "\nThis script uses Chromium Browser to crawl data from Rumah123.",
    "Note: This script is configured to run on your local device.",
    "\nOpening rumah123 search page...\n",
    sep="\n",
)

# Chrome options for a local, visible (non-headless) browser window.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")  # open the browser window maximized

# Pool of desktop user-agent strings; one is chosen at random for this run.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
]
options.add_argument("user-agent=" + random.choice(user_agents))

# Initialize the driver with the options assembled above.
print("Starting local Chrome driver...")
driver = webdriver.Chrome(options=options)
print("Driver started successfully.")

# Accumulates one dict per scraped property; the full list is written to CSV after every item.
data_list = []

# Output CSV path, relative to the current working directory.
csv_file = './scraped_land_data.csv'

# If a previous run left an output file behind, move it aside to *.old.csv
# so this run does not overwrite it.
if os.path.exists(csv_file):
    old_file = csv_file.replace('.csv', '.old.csv')
    try:
        os.rename(csv_file, old_file)
        print(f"Found existing file {csv_file}, renaming to {old_file}")
    except OSError as e:
        print(f"Could not rename file {csv_file}. It might be open. Error: {e}")
        print("Exiting to prevent data loss.")
        driver.quit()
        # Raise SystemExit instead of calling exit(): inside an IPython/Jupyter
        # kernel, exit() only requests a kernel shutdown and does not raise, so
        # the remaining code in the cell would keep running with a closed driver.
        # A non-zero status marks this as an abnormal termination.
        raise SystemExit(1)

# Search-results URL for land listings in DKI Jakarta; the page number is appended.
base_url = "https://www.rumah123.com/jual/dki-jakarta/tanah/?page="

# Inclusive range of result pages to crawl.
start_page = 1
end_page = 5


def _captcha_present(driver):
    """Return True if a reCAPTCHA iframe or ``div.g-recaptcha`` exists in the current DOM.

    Only presence is checked; whether the element is actually visible is not verified.
    """
    try:
        driver.find_element(By.CSS_SELECTOR, 'iframe[src*="recaptcha"], div.g-recaptcha')
    except NoSuchElementException:
        return False
    return True


def _scroll_gradually(driver, max_scroll_attempts=50):
    """Scroll the page down in small random steps.

    Stops once the accumulated scroll distance reaches the document height or
    after ``max_scroll_attempts`` steps, whichever comes first. The document
    height is re-read after every step because the page may grow while scrolling.
    """
    total_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0
    for _ in range(max_scroll_attempts):
        if current_scroll >= total_height:
            break
        scroll_amount = random.randint(200, 500)
        driver.execute_script(f"window.scrollBy(0, {scroll_amount});")
        time.sleep(random.uniform(0.2, 0.5))
        current_scroll += scroll_amount
        # Do NOT stop just because the height is unchanged: on a page that does
        # not grow, that would end the scroll after a single step and the lower
        # part of the page would never be brought into view.
        total_height = driver.execute_script("return document.body.scrollHeight")


def _collect_listing_links(driver):
    """Return the de-duplicated property detail URLs found on the current page, in page order."""
    links = []
    # find_elements returns an empty list when nothing matches (it does not raise
    # NoSuchElementException), so the "no listings" case is handled by the caller
    # checking for an empty result.
    for listing in driver.find_elements(By.CSS_SELECTOR, "a.gap-1.w-full"):
        try:
            href = listing.get_attribute('href')
        except Exception as e:
            # Best effort: skip an anchor that cannot be read, but say so
            # instead of swallowing the error silently.
            print(f"Skipping a listing link that could not be read: {e}")
            continue
        if href and href.startswith('https://www.rumah123.com/properti/'):
            links.append(href)
    # dict.fromkeys removes duplicates while preserving first-seen order.
    return list(dict.fromkeys(links))


# The finally clause guarantees the browser is closed even when scraping fails part-way.
try:
    for page in range(start_page, end_page + 1):
        url = base_url + str(page)
        print(f"Opening page {page}: {url}")
        driver.get(url)

        # Random page loading delay
        time.sleep(random.uniform(3, 6))

        # Give the user a short window to solve a CAPTCHA in the browser window.
        if _captcha_present(driver):
            print(f"!!! CAPTCHA DETECTED on page {page}. !!!")
            print("!!! ANDA HARUS MENYELESAIKANNYA DI JENDELA BROWSER SEKARANG. !!!")
            print("!!! Skrip akan menjeda selama 5 DETIK agar Anda bisa menyelesaikannya. !!!")
            time.sleep(5)
            print("Waktu jeda selesai, melanjutkan skrip...")

        # Gradual scrolling
        print("Starting scroll...")
        _scroll_gradually(driver)
        print("Scrolling finished for this page.")

        # Collect every detail-page link first, before navigating away from the listing.
        href_list = _collect_listing_links(driver)
        print(f"Found {len(href_list)} property links on page {page}.")

        if not href_list:
            print(f"No valid listings found on page {page}. Stopping.")
            break

        for href in href_list:
            time.sleep(random.uniform(1, 3))
            print(f"Opening detail page: {href}")
            driver.get(href)
            time.sleep(random.uniform(3, 6))

            # Same CAPTCHA check on the detail page, with a longer pause.
            if _captcha_present(driver):
                print(f"!!! CAPTCHA DETECTED on detail page {href}. !!!")
                print("!!! Skrip akan menjeda selama 20 DETIK agar Anda bisa menyelesaikannya. !!!")
                time.sleep(20)
                print("Waktu jeda selesai, melanjutkan skrip...")

            # Try to click the expand button of the specification section; a failure
            # is logged and the script proceeds without expansion.
            expand_clicked = False
            try:
                expand_container = driver.find_element(By.CSS_SELECTOR, "#property-information > div:nth-of-type(1) > div:nth-of-type(2)")
                expand_button = expand_container.find_element(By.CSS_SELECTOR, "span[data-test-id='expanded-specification']")
                ActionChains(driver).move_to_element(expand_button).perform()
                time.sleep(random.uniform(0.5, 1))
                expand_button.click()
                time.sleep(random.uniform(1, 2))
                expand_clicked = True
                print("Successfully clicked 'Muat lebih banyak' button.")
            except Exception as e:
                print(f"No 'Muat lebih banyak' button found or click failed: {str(e)}. Proceeding without expansion.")

            # Extract data (same as before)
            data = {}
            # ... (paste your field-extraction code here; omitted for brevity)
            data_list.append(data)

            # Save incrementally so a crash loses at most the item in progress.
            df = pd.DataFrame(data_list)
            df.to_csv(csv_file, index=False)
            print(f"Your data saved to: {csv_file}")
            print(f"Total items saved: {len(data_list)}\n")

            # Reload the listing page between detail pages. The remaining links are
            # already held in href_list, so link collection does not depend on this reload.
            time.sleep(random.uniform(1, 3))
            print(f"Going back to listing page {page}...")
            driver.get(url)
            time.sleep(random.uniform(2, 4))

        print(f"--Finished page {page}. Taking a break, waiting for 10 seconds...\n")
        time.sleep(10)
finally:
    # Close the driver
    driver.quit()

print("Scraping completed. Data saved to 'scraped_land_data.csv'.")

In [None]:
import pandas as pd

def read_scraped_data(filename='scraped_land_data.csv'):
    """Load a scraped-data CSV file as a pandas DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file is missing or cannot be
        read (a message describing the problem is printed in either case).
    """
    try:
        frame = pd.read_csv(filename)
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None
    except Exception as e:
        # Any other read/parse failure is reported and turned into None.
        print(f"An error occurred while reading the file: {e}")
        return None

    print(f"Successfully loaded data from '{filename}'.")
    print(f"DataFrame shape: {frame.shape}")
    return frame

# Example usage (optional, you can remove this if you just want the function definition)
# scraped_df = read_scraped_data()
# if scraped_df is not None:
#     display(scraped_df.head())

In [None]:
# Load the CSV using the default filename. The loader returns None when the file
# is missing or unreadable, in which case nothing is displayed.
scraped_df = read_scraped_data()
if scraped_df is not None:
    # Show the first rows of the loaded DataFrame.
    display(scraped_df.head())