In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

from seleniumbase import Driver
import datetime

### Get All Links on the Current Page

In [2]:
def get_all_link(driver) -> list:
    """Collect the listing links found on the page currently loaded in ``driver``.

    The page HTML is read from ``driver.page_source`` and every
    ``div.cardSecondary__info-wrapper_detail-basic-content`` card is searched
    for its first ``<a>`` element that carries an ``href`` attribute.

    Args:
        driver: Browser driver exposing the current HTML as ``page_source``.

    Returns:
        list: ``href`` values in document order. Cards without a link are
        skipped, as are links whose ``href`` contains ``'projects'``.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")
    cards = soup.find_all('div', class_='cardSecondary__info-wrapper_detail-basic-content')

    nav_link = []
    for card in cards:
        anchor = card.find('a', href=True)
        if anchor is None:
            # No link in this card: skip it rather than emitting a placeholder
            # string that a caller would then try to navigate to.
            continue
        href = anchor['href']
        # NOTE(review): presumably 'projects' URLs are development/project pages
        # rather than individual listings -- confirm against the site.
        if 'projects' in href:
            continue
        nav_link.append(href)

    return nav_link

### Get Data from the Detail Page

In [3]:
# Install the Google Maps client library used by the geocoding helpers (-q keeps pip's output quiet).
%pip install googlemaps -q

Note: you may need to restart the kernel to use updated packages.


In [4]:
import getpass
import os

import googlemaps

# Never store the API key in the notebook itself: read it from the
# GOOGLE_MAPS_API_KEY environment variable and, when that is not set, prompt
# for it without echoing the value into the cell output.
# NOTE(review): any key that was ever hard-coded in this cell should be treated
# as leaked and rotated in the Google Cloud console.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY") or getpass.getpass("Google Maps API key: ")

# Shared client used by the geocoding helpers defined in this cell.
gmaps = googlemaps.Client(key=api_key)

def get_full_address(given_address):
    """Geocode ``given_address`` and return the formatted address of the best match.

    Args:
        given_address: Free-text address passed to ``gmaps.geocode``.

    Returns:
        The ``formatted_address`` field of the first geocoding result, or the
        string ``"Address not found"`` when the geocoder returns no results.
    """
    matches = gmaps.geocode(given_address)
    if not matches:
        return "Address not found"
    # Only the first result is used.
    return matches[0]['formatted_address']

def get_lat_long(given_address):
    """Geocode ``given_address`` and return the coordinates of the best match.

    Args:
        given_address: Free-text address passed to ``gmaps.geocode``.

    Returns:
        A ``(lat, lng)`` tuple taken from the first geocoding result, or the
        string ``"Coordinates not found"`` when the geocoder returns no results.
        Callers must check for the string before unpacking the result.
    """
    matches = gmaps.geocode(given_address)
    if not matches:
        return "Coordinates not found"
    # Only the first result is used.
    point = matches[0]['geometry']['location']
    return (point['lat'], point['lng'])

def format_updated_time(raw_time):
    """Convert an Indonesian-style date such as ``"12 Des 2024"`` to ``"12/12/2024"``.

    The month token is matched on its first three letters, case-insensitively,
    so abbreviations (``"Agu"``), full names (``"Agustus"``) and the English
    abbreviations that differ from Indonesian (``"May"``, ``"Aug"``, ``"Oct"``,
    ``"Dec"``) are all accepted. Parsing does not depend on the process locale.

    Args:
        raw_time: Text of the form ``"<day> <month> <year>"``; surrounding
            whitespace is ignored.

    Returns:
        str: The date formatted as ``DD/MM/YYYY``.

    Raises:
        IndexError: If ``raw_time`` has fewer than two whitespace-separated tokens.
        KeyError: If the month token is not recognised.
        ValueError: If there are not exactly three tokens, the day or year is
            not numeric, or the values do not form a valid calendar date.
    """
    # Local import under a private alias: the notebook binds the name
    # ``datetime`` both to the class and to the module, so relying on the
    # global name is fragile.
    import datetime as _dt

    # Month numbers keyed by the capitalised three-letter prefix of the month
    # name (Indonesian first, then the English spellings that differ).
    month_numbers = {
        "Jan": 1,
        "Feb": 2,
        "Mar": 3,
        "Apr": 4,
        "Mei": 5, "May": 5,
        "Jun": 6,
        "Jul": 7,
        "Agu": 8, "Aug": 8,
        "Sep": 9,
        "Okt": 10, "Oct": 10,
        "Nov": 11,
        "Des": 12, "Dec": 12,
    }

    # Split into day, month and year tokens.
    parts = raw_time.strip().split()
    month = month_numbers[parts[1][:3].capitalize()]
    if len(parts) != 3:
        raise ValueError(f"expected '<day> <month> <year>', got {raw_time!r}")

    # Building a date object validates the day/month/year combination.
    parsed = _dt.date(int(parts[2]), month, int(parts[0]))
    return f"{parsed.day:02d}/{parsed.month:02d}/{parsed.year:04d}"

In [5]:
from selenium.common.exceptions import NoSuchElementException

def _extract_updated_time(soup):
    """Return the date from the "Diperbaharui" badge as DD/MM/YYYY, or "N/A"."""
    listing_badges = soup.find('div', class_='listingBadges')
    if listing_badges is None:
        return "N/A"
    for badge in listing_badges.find_all('div', class_='badge'):
        span = badge.find('span')
        if span is None or "Diperbaharui" not in span.text:
            continue
        # The date is whatever follows the first colon in the badge text.
        _, colon, raw_time = span.text.partition(":")
        if not colon:
            continue
        try:
            return format_updated_time(raw_time.strip())
        except (IndexError, KeyError, ValueError):
            # Unparseable date text: keep looking instead of aborting the crawl.
            continue
    return "N/A"


def _extract_price(soup):
    """Return the text of the price tag's <strong> element, or "N/A"."""
    listing_price = soup.find('div', class_='listingPrice')
    if listing_price is None:
        return "N/A"
    price_tag = listing_price.find('div', class_='listingPrice__tag')
    if price_tag is None:
        return "N/A"
    strong = price_tag.find('strong')
    return strong.text.strip() if strong is not None else "N/A"


def _extract_location(soup):
    """Return (district, city, latitude, longitude), each "N/A" when unavailable."""
    district = city = lat = long = "N/A"
    address_elem = soup.find('address', class_='wrapper-address')
    if address_elem is None:
        return district, city, lat, long

    address = address_elem.text
    coords = get_lat_long(address)
    # get_lat_long returns a sentinel string when geocoding finds nothing;
    # unpacking that string would raise, so only accept a real coordinate pair.
    if isinstance(coords, (tuple, list)) and len(coords) == 2:
        lat, long = coords

    # The address text is split on commas; the first two parts are used as
    # district and city.
    address_parts = address.split(',')
    if len(address_parts) >= 2:
        district = address_parts[0].strip()
        city = address_parts[1].strip()
    return district, city, lat, long


def get_data_each_page(driver):
    """Scrape the listing detail page currently loaded in ``driver``.

    Args:
        driver: Browser driver positioned on a listing detail page. It must
            provide ``click(selector)`` and ``page_source``.

    Returns:
        dict: One record with the keys ``district``, ``city``, ``latitude``,
        ``longitude``, ``updated_time``, ``price``, ``land_area_sqm``,
        ``building_area_sqm`` and the lower-cased detail-table headers
        (``jumlah lantai``, ``kamar tidur``, ``kamar mandi``, ``carports``,
        ``garasi``, ``sertifikat``, ``daya listrik``, ``interior``,
        ``tahun dibangun``). Any value that cannot be found is ``"N/A"``.
    """
    # "Lihat Selengkapnya" is Indonesian for "See more"; presumably this
    # expands a collapsed section before the HTML is captured. Pages without
    # the button are scraped as-is.
    try:
        driver.click('button:contains("Lihat Selengkapnya")')
    except NoSuchElementException:
        pass

    soup = BeautifulSoup(driver.page_source, "html.parser")

    district, city, lat, long = _extract_location(soup)

    detail_dict = {
        "district": district,
        "city": city,
        "latitude": lat,
        "longitude": long,
        "updated_time": _extract_updated_time(soup),
        "price": _extract_price(soup),
        "land_area_sqm": "N/A",
        "building_area_sqm": "N/A",
        "jumlah lantai": "N/A",
        "kamar tidur": "N/A",
        "kamar mandi": "N/A",
        "carports": "N/A",
        "garasi": "N/A",
        "sertifikat": "N/A",
        "daya listrik": "N/A",
        "interior": "N/A",
        "tahun dibangun": "N/A"
    }

    # Land size ("LT") and building size ("LB") from the overview strip.
    overview = soup.find('div', class_='listingOverviewDefault')
    if overview is not None:
        for item in overview.find_all('div'):
            text = item.text.strip()
            if text.startswith("LT"):
                detail_dict["land_area_sqm"] = text[2:].strip()
            elif text.startswith("LB"):
                detail_dict["building_area_sqm"] = text[2:].strip()

    # Remaining details: table cells are read as consecutive (header, value)
    # pairs; a trailing unpaired cell is ignored.
    details = soup.find('section', id='detailRef')
    detail_table = details.find('table') if details is not None else None
    if detail_table is not None:
        for row in detail_table.find_all('tr'):
            cells = row.find_all('td')
            for header_cell, value_cell in zip(cells[0::2], cells[1::2]):
                header = header_cell.text.strip().lower()
                # Only headers that match a predefined key are recorded.
                if header in detail_dict:
                    detail_dict[header] = value_cell.text.strip()

    return detail_dict

## Iterate over all pages

In [6]:
# Accumulator for the scraped listings: the crawl loop appends one detail dict
# per listing. Re-run this cell before re-running the crawl to avoid duplicate rows.
data = []

In [7]:
# Number of search-result pages to crawl.
MAX_PAGES = 2

driver = Driver(uc=True)

try:
    for pagenum in range(1, MAX_PAGES + 1):
        print('Page: ', pagenum)
        driver.get(f"https://www.99.co/id/jual/rumah/jakarta?hlmn={pagenum}")
        all_link_each_page = get_all_link(driver)

        # Visit every listing found on this result page and store its record.
        for count, link in enumerate(all_link_each_page, start=1):
            driver.get(link)
            data.append(get_data_each_page(driver))
            print('Rumah ke: ', count)
finally:
    # Always close the browser, even when a page fails to load or parse, so
    # no orphaned browser process is left behind.
    driver.quit()

Page:  1
Rumah ke:  1
Rumah ke:  2
Rumah ke:  3
Rumah ke:  4
Rumah ke:  5
Rumah ke:  6


In [None]:
# Turn the collected listing records into a table.
df = pd.DataFrame(data=data)

# Write the table to disk without the row index.
df.to_csv(path_or_buf='property_listings.csv', index=False)

print("Data saved to property_listings.csv")