In [76]:
import os
import random
import re
import time
from decimal import Decimal, InvalidOperation

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [77]:
# Pool of browser User-Agent strings used for rotation: crawl_page picks one
# at random for every HTTP request it sends.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
]

In [157]:
# Crawl configuration. `region` is interpolated into the listing URL below and
# is also used later to name the output Excel file.
# Alternative, more specific listing URL (with a query string) kept for reference:
# base_url = "https://www.rumah123.com/jual/salatiga/salatiga/rumah/?itm_medium=search-suggestions-lokasi&itm_source=organic"
region = 'badung'
base_url = f"https://www.rumah123.com/jual/{region}/rumah/"

In [79]:
# Convert a displayed price label into an integer amount
def convert_price(price_str):
    """Parse a price label such as ``"Rp 5,8 Miliar"`` into a whole number.

    The label must start with ``Rp`` followed by a number written in
    Indonesian notation ('.' as thousands separator, ',' as decimal mark) and
    an optional magnitude word (Triliun, Miliar, Juta or Ribu; matched
    case-insensitively).

    Args:
        price_str: The price text to parse.

    Returns:
        The amount as an ``int`` (fractions are truncated), or ``None`` when
        the text does not look like a price or cannot be converted. Conversion
        errors are printed rather than raised.
    """
    # Multiplier applied for each magnitude word that may follow the number.
    unit_multipliers = {
        'triliun': 1_000_000_000_000,
        'miliar': 1_000_000_000,
        'juta': 1_000_000,
        'ribu': 1_000,
    }
    try:
        match = re.match(
            r'Rp\s*([\d.,]+)\s*(Triliun|Miliar|Juta|Ribu)?', price_str, re.IGNORECASE)
        if not match:
            return None

        # Drop thousands separators, then turn the decimal comma into a point.
        normalized = match.group(1).replace('.', '').replace(',', '.')

        # Decimal keeps the scaling exact; binary floats can land just below
        # the intended integer and then be truncated one unit short by int().
        amount = Decimal(normalized)

        unit = match.group(2)
        if unit:
            # Unknown spellings (possible with Unicode case folding) fall back
            # to no scaling instead of raising.
            amount *= unit_multipliers.get(unit.lower(), 1)

        return int(amount)
    except (TypeError, ValueError, InvalidOperation) as e:
        # TypeError: non-string input; InvalidOperation/ValueError: the digit
        # group is not a valid number (e.g. only separators).
        print(f"Error converting price: {e}")
        return None

In [80]:
def convert_area(land_str):
    """Parse an area label of the form ``": 126 m²"`` into an integer.

    The text must begin with a colon, followed by digits (commas allowed and
    discarded) and the ``m²`` unit.

    Args:
        land_str: The area text to parse.

    Returns:
        The area as an ``int``, or ``None`` when the text does not match the
        expected form or conversion fails (the error is printed).
    """
    try:
        found = re.match(r':\s*([\d,]+)\s*m\xb2', land_str)
        if found is not None:
            digits_only = found.group(1).replace(',', '')
            return int(digits_only)
        return None
    except Exception as e:
        print(f"Error converting land area: {e}")
        return None

In [81]:
def _parse_listing(listing):
    """Extract one property record (a dict) from a single listing card element."""
    anchor = listing.find("a")
    title = anchor.text.strip() if anchor else "-"
    listing_url = anchor.get("href") if anchor else "-"

    price_tag = listing.select_one(".card-featured__middle-section__price")
    price = convert_price(price_tag.text.strip()) if price_tag else None

    spans = listing.find_all("span")
    location = spans[1].text.strip() if len(spans) > 1 else "-"
    if "Cicilan" in location:
        # The second span holds instalment ("Cicilan") text on some cards; the
        # address is then taken from the next span, when there is one.
        location = spans[2].text.strip() if len(spans) > 2 else "-"

    attribute_info = listing.select(".attribute-info")
    # First attribute is used as land area, second as building area; each is
    # read only if it is actually present.
    land_area = (convert_area(attribute_info[0].text.strip())
                 if len(attribute_info) > 0 else None)
    building_area = (convert_area(attribute_info[1].text.strip())
                     if len(attribute_info) > 1 else None)

    return {
        "title": title,
        "price": price,
        "address": location,
        "land_area": land_area,
        "building_area": building_area,
        "url": listing_url
    }


def crawl_page(url, max_retries=5, timeout=30):
    """Download one search-result page and return the listings found on it.

    The request is sent with a randomly chosen User-Agent. HTTP 429 responses
    and network-level errors are retried with exponential back-off
    (1, 2, 4, ... seconds) up to ``max_retries`` attempts.

    Args:
        url: Full URL of the result page to fetch.
        max_retries: Maximum number of request attempts.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``address``,
        ``land_area``, ``building_area`` and ``url``. The list is empty when
        the page could not be fetched with status 200 or holds no listings.
        Listings that fail to parse are reported via ``print`` and skipped.
    """
    response = None
    last_error = None

    for attempt in range(max_retries):
        headers = {
            "User-Agent": random.choice(USER_AGENTS)
        }
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            last_error = None
        except requests.RequestException as e:
            response = None
            last_error = e
        else:
            if response.status_code != 429:
                break

        # Rate-limited or failed: wait exponentially longer before the next
        # attempt, but do not sleep when no attempt is left.
        if attempt + 1 < max_retries:
            print(f"Retrying {url} ({attempt+1})")
            time.sleep(2 ** attempt)

    if response is None:
        print(f"Gagal mengakses {url} : {last_error}")
        return []
    if response.status_code != 200:
        print(f"Gagal mengakses {url} : {response.status_code} {response.reason}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    property_list = []

    for listing in soup.select(".ui-atomic-card .card-featured__middle-section"):
        try:
            property_list.append(_parse_listing(listing))
        except Exception as e:
            # Best effort: one malformed card must not discard the whole page.
            print(f"Error parsing listing: {e}")
    return property_list

In [82]:
# Main entry point: crawl a consecutive range of result pages
def crawl_rumah123(base_url, start_page=1, total_pages=1):
    """Crawl ``total_pages`` result pages starting at ``start_page``.

    Each page URL is ``base_url`` plus a ``page=<n>`` query parameter and is
    fetched through ``crawl_page``. A random 2-5 second pause is inserted
    between consecutive pages to reduce the chance of being blocked.

    Args:
        base_url: Listing URL without the page parameter; it may already
            contain a query string.
        start_page: Number of the first page to fetch.
        total_pages: How many consecutive pages to fetch.

    Returns:
        A single list with the records of all crawled pages, in page order.
    """
    all_properties = []

    # Join with '&' when base_url already carries a query string, so the
    # resulting URL never contains a second '?'.
    separator = "&" if "?" in base_url else "?"
    last_page = start_page + total_pages - 1

    for page in range(start_page, start_page + total_pages):
        print(f"Scraping halaman {page}...")
        properties = crawl_page(f"{base_url}{separator}page={page}")
        all_properties.extend(properties)

        # Random 2-5 second delay between pages; nothing follows the last
        # page, so no pause is needed there.
        if page < last_page:
            time.sleep(random.uniform(2, 5))

    return all_properties

In [158]:
# Crawl the first 50 result pages of the configured region.
data = crawl_rumah123(base_url, start_page=1, total_pages=50)

Scraping halaman 1...
Scraping halaman 2...
Scraping halaman 3...
Scraping halaman 4...
Scraping halaman 5...
Scraping halaman 6...
Scraping halaman 7...
Scraping halaman 8...
Scraping halaman 9...
Scraping halaman 10...
Scraping halaman 11...
Scraping halaman 12...
Scraping halaman 13...
Scraping halaman 14...
Scraping halaman 15...
Scraping halaman 16...
Scraping halaman 17...
Scraping halaman 18...
Scraping halaman 19...
Scraping halaman 20...
Scraping halaman 21...
Scraping halaman 22...
Scraping halaman 23...
Scraping halaman 24...
Scraping halaman 25...
Scraping halaman 26...
Scraping halaman 27...
Scraping halaman 28...
Scraping halaman 29...
Scraping halaman 30...
Scraping halaman 31...
Scraping halaman 32...
Scraping halaman 33...
Scraping halaman 34...
Scraping halaman 35...
Scraping halaman 36...
Scraping halaman 37...
Scraping halaman 38...
Scraping halaman 39...
Scraping halaman 40...
Scraping halaman 41...
Scraping halaman 42...
Scraping halaman 43...
Scraping halaman 44.

In [159]:
if data:
    df = pd.DataFrame(data)

    # Save to an Excel file named after the crawled region
    filename = f"harga_rumah_{region}"
    filepath = f"{filename}.xlsx"

    if os.path.exists(filepath):
        # Append to the rows stored by earlier runs.
        existing_df = pd.read_excel(filepath)
        final_df = pd.concat([existing_df, df], ignore_index=True)
    else:
        final_df = df

    # Drop exact duplicate rows so that re-running this cell with the same
    # `data` does not write the same records into the file again.
    final_df = final_df.drop_duplicates().reset_index(drop=True)
    final_df.to_excel(filepath, index=False)
    print(len(final_df))

    print(f"Data berhasil disimpan ke {filepath}")
else:
    print("Tidak ada data yang ditemukan.")

1000
Data berhasil disimpan ke harga_rumah_badung.xlsx


In [71]:
# Debug cell: fetch and parse one result page step by step at top level so the
# intermediate objects (response, soup, spans, ...) can be inspected.
# NOTE(review): this repeats the logic of crawl_page(); keep the two in sync.
url = 'https://www.rumah123.com/jual/jakarta-pusat/rumah/?page=160'
retry_count = 0
max_retries = 5

while retry_count < max_retries:
    headers = {
        "User-Agent": random.choice(USER_AGENTS)
    }
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 429:
        break
    wait = 2 ** retry_count  # exponential back-off
    print(f"Retrying {url} ({retry_count+1})")
    time.sleep(wait)
    retry_count += 1

property_list = []

if response.status_code != 200:
    # Do not try to parse an error page as if it were a listing page.
    print(f"Gagal mengakses {url} : {response.status_code} {response.reason}")
else:
    soup = BeautifulSoup(response.content, "html.parser")

    for listing in soup.select(".ui-atomic-card .card-featured__middle-section"):
        try:
            anchor = listing.find("a")
            title = anchor.text.strip() if anchor else "-"
            # Separate name so the page URL in `url` is not overwritten.
            listing_url = anchor.get("href") if anchor else "-"

            price_tag = listing.select_one(".card-featured__middle-section__price")
            price = convert_price(price_tag.text.strip()) if price_tag else None

            spans = listing.find_all("span")
            location = spans[1].text.strip() if len(spans) > 1 else "-"
            if "Cicilan" in location:
                # Second span holds instalment text on some cards; use the
                # next span as the address when it exists.
                location = spans[2].text.strip() if len(spans) > 2 else "-"

            attribute_info = listing.select(".attribute-info")

            # Each attribute is read only if it is actually present.
            land_area = (convert_area(attribute_info[0].text.strip())
                         if len(attribute_info) > 0 else None)
            building_area = (convert_area(attribute_info[1].text.strip())
                             if len(attribute_info) > 1 else None)

            property_list.append({
                "title": title,
                "price": price,
                "address": location,
                "land_area": land_area,
                "building_area": building_area,
                "url": listing_url
            })

        except Exception as e:
            # Best effort: report the malformed card and move on.
            print(f"Error parsing listing: {e}")
            continue

property_list

[{'title': 'Rumah Cantik Full Furnished Area Istimewa Benhil, Jakarta Pusat',
  'price': 5800000000,
  'address': 'Bendungan Hilir, Jakarta Pusat',
  'land_area': 126,
  'building_area': 250,
  'url': '/properti/jakarta-pusat/hos17253723/'},
 {'title': 'Dijual Rumah  2,5 Lantai Di Pusat Kota Jakarta Lingkungan Nyaman Bebas Banjir Di Bendungan Hilir Jakarta Pusat',
  'price': 6500000000,
  'address': 'Bendungan Hilir, Jakarta Pusat',
  'land_area': 126,
  'building_area': 180,
  'url': '/properti/jakarta-pusat/hos17253561/'},
 {'title': 'Rumah Mewah Modern Classic di Menteng Jakarta Pusat 2 Lantai SHM Bagus',
  'price': 90000000000,
  'address': 'Menteng, Jakarta Pusat',
  'land_area': 876,
  'building_area': 1400,
  'url': '/properti/jakarta-pusat/hos17253514/'},
 {'title': 'Dijual  Rumah baru 100% Taman solo /Cempaka putih tengah Jakpus',
  'price': 7200000000,
  'address': 'Cempaka Putih, Jakarta Pusat',
  'land_area': 180,
  'building_area': 250,
  'url': '/properti/jakarta-pusat/ho

In [70]:
# Leftover debugging check. `spans` is not defined in this cell: it is the
# variable assigned inside the listing loop of the single-page debug cell, so
# this expression only works after that loop has processed at least one card.
# NOTE(review): relies on hidden kernel state — consider removing.
"Cicilan" in spans[0].text

True