# Setup

In [27]:
import json
import time

import polars as pl
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Constants

In [6]:
# Page the scraper opens first to collect the individual property links.
BASE_URL = (
    "https://www.99.co"
    "/singapore/condos-apartments/alphabets/all"
)

# A collected link is skipped when its URL contains any of these substrings.
EXCLUDE_KEYWORDS = [
    "showflat-reviews",
    "news",
    "guides",
]

# Scraper

In [15]:
class PropertyScraper:
    """Selenium-driven scraper that collects property links from an index page
    and extracts a details record from each property page.

    Typical use is ``PropertyScraper(url, exclude_keywords=[...]).scrape_all_properties()``,
    which returns a list of dicts and always closes the browser when it finishes
    (normally or because of an error).
    """

    # Locators used on the index page and on each property page.
    _PROPERTY_LINK_SELECTOR = "a[href^='/singapore/condos-apartments']"
    _SEE_MORE_XPATH = "//button[.//span[text()='See more']]"
    _DETAILS_ROW_XPATH = "//tr[@itemprop='additionalProperty']"
    _DETAILS_LABEL_XPATH = ".//h4[@itemprop='name']"
    _DETAILS_VALUE_CLASS = "DetailsTable_value__tmYmT"
    _DETAILS_VALUE_XPATH = ".//td[contains(@class, 'DetailsTable_value__tmYmT')]"
    _BUILDINGS_CONTAINER_CLASS = "Buildings_list__6pdZm"
    _BUILDING_ITEM_CLASS = "BuildingList_list-item__rlNXZ"

    # Labels (lower-cased) copied from the details table into the result record.
    _DETAIL_FIELDS = ("address", "number of units", "property type", "developer")

    def __init__(self, base_url, exclude_keywords=None, driver_path="chromedriver.exe", wait_time=5):
        """Store the configuration and start a Chrome session.

        Args:
            base_url: Index page opened by ``get_property_links``.
            exclude_keywords: Substrings; a link whose URL contains any of them
                is dropped. ``None`` means no filtering.
            driver_path: Path to the chromedriver executable.
            wait_time: Seconds used both for the fixed pause after loading the
                index page and as the timeout of the explicit waits.
        """
        self.base_url = base_url
        self.exclude_keywords = exclude_keywords or []
        self.wait_time = wait_time
        self.driver = self._init_driver(driver_path)

    def _init_driver(self, driver_path):
        """Create the Chrome WebDriver from the given chromedriver path."""
        service = Service(executable_path=driver_path)
        driver = webdriver.Chrome(service=service)
        # Maximise once here instead of on every property page.
        driver.maximize_window()
        return driver

    def get_property_links(self):
        """Open the index page and return the unique, filtered property URLs.

        Returns:
            list[str]: URLs in first-seen order, without duplicates and without
            any URL containing one of ``self.exclude_keywords``.
        """
        self.driver.get(self.base_url)
        # Fixed pause so the page can finish rendering its links.
        time.sleep(self.wait_time)

        anchors = self.driver.find_elements(
            By.CSS_SELECTOR, self._PROPERTY_LINK_SELECTOR)
        return self._filter_property_urls(
            anchor.get_attribute('href') for anchor in anchors)

    def _filter_property_urls(self, urls):
        """Drop empty/excluded URLs and de-duplicate, keeping first-seen order."""
        kept = {}
        for url in urls:
            if not url:
                # get_attribute may return None for an anchor without href.
                continue
            # Use the instance's keywords so the constructor argument is honoured.
            if any(keyword in url for keyword in self.exclude_keywords):
                continue
            kept.setdefault(url, None)
        return list(kept)

    def handle_see_more_button(self):
        """Click the "See more" button on the current page if one is present.

        Best-effort: any failure is printed and otherwise ignored.
        """
        try:
            if not self.driver.find_elements(By.XPATH, self._SEE_MORE_XPATH):
                return
            see_more_button = WebDriverWait(self.driver, self.wait_time).until(
                EC.element_to_be_clickable((By.XPATH, self._SEE_MORE_XPATH))
            )
            see_more_button.click()
        except Exception as e:
            print(f"Failed to handle 'See more' button {e}")

    @staticmethod
    def _property_name_from_url(property_url):
        """Derive a display name from the last URL segment ("a-b" -> "A B")."""
        # Tolerate a trailing slash so the slug is not an empty string.
        slug = property_url.rstrip('/').split('/')[-1]
        return ' '.join(part.capitalize() for part in slug.split('-'))

    def _read_details_table(self):
        """Return {lower-cased label: value text} for the details table rows."""
        table_elements = {}
        for row in self.driver.find_elements(By.XPATH, self._DETAILS_ROW_XPATH):
            try:
                label_text = row.find_element(
                    By.XPATH, self._DETAILS_LABEL_XPATH).text.strip().lower()
                value = row.find_element(
                    By.XPATH, self._DETAILS_VALUE_XPATH).text.strip()
            except NoSuchElementException:
                # find_element raises (never returns None) when a cell is
                # missing; skip that row instead of losing the whole property.
                continue
            table_elements[label_text] = value
        return table_elements

    def _read_buildings(self):
        """Return [{"address", "postal_code"}, ...] from the buildings list.

        Returns an empty list when the page has no buildings section or when
        reading it fails.
        """
        try:
            container = self.driver.find_element(
                By.CLASS_NAME, self._BUILDINGS_CONTAINER_CLASS)
            buildings = []
            for item in container.find_elements(By.CLASS_NAME, self._BUILDING_ITEM_CLASS):
                spans = item.find_elements(By.TAG_NAME, "span")
                if len(spans) == 2:  # exactly two spans: address and postal code
                    buildings.append({"address": spans[0].text.strip(),
                                      "postal_code": spans[1].text.strip()})
            return buildings
        except NoSuchElementException:
            # No buildings section on this page.
            return []
        except Exception as e:
            # Stay best-effort, but make the failure visible.
            print(f"Failed to read buildings list: {e}")
            return []

    def scrape_property_details(self, property_url):
        """Load one property page and extract its details.

        Args:
            property_url: URL of the property page.

        Returns:
            dict | None: Record with keys ``property_name``, ``address``,
            ``number of units``, ``property type``, ``developer``,
            ``buildings`` and ``property_url``; ``None`` if the details table
            did not appear within ``wait_time`` seconds.
        """
        self.driver.get(property_url)
        try:
            # Wait for the details table to load.
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, self._DETAILS_VALUE_CLASS))
            )
        except TimeoutException:
            print(f"Timeout loading {property_url}")
            return None

        table_elements = self._read_details_table()

        res = {"property_name": self._property_name_from_url(property_url),
               "address": None,
               "number of units": None,
               "property type": None,
               "developer": None}

        for field in self._DETAIL_FIELDS:
            if field not in table_elements:
                continue
            value = table_elements[field]
            if field == "address":
                # The address cell text also carries a "View on map" link label.
                value = value.strip().split("View on map")[0].strip()
            res[field] = value

        # Expand the buildings list before reading it.
        self.handle_see_more_button()

        res["buildings"] = self._read_buildings()
        res["property_url"] = property_url

        print(res)
        return res

    def scrape_all_properties(self):
        """Scrape every property linked from the index page.

        A failure on one property is printed and that property is skipped.
        The browser is closed when this method exits, including on errors or
        a keyboard interrupt.

        Returns:
            list[dict]: One record per successfully scraped property.
        """
        all_property_details = []
        try:
            property_links = self.get_property_links()
            total = len(property_links)

            for cnt, link in enumerate(property_links, start=1):
                print(f"Scraping {cnt} / {total} properties...")
                try:
                    property_details = self.scrape_property_details(link)
                except Exception as e:
                    # Exception (not bare except) so Ctrl-C still stops the run.
                    print(f"Failed to scrape {link}: {e}")
                    continue
                if property_details:
                    all_property_details.append(property_details)
        finally:
            self.close()

        return all_property_details

    def close(self):
        """Quit the browser; safe to call more than once."""
        driver = self.driver
        if driver is None:
            return
        try:
            driver.quit()
        finally:
            self.driver = None

# Run

In [16]:
# Build the scraper for the index page, skipping links that match EXCLUDE_KEYWORDS
scraper = PropertyScraper(
    base_url=BASE_URL,
    exclude_keywords=EXCLUDE_KEYWORDS,
)

# Visit every collected link; the result is a list with one dict per property
property_data = scraper.scrape_all_properties()

Scraping 1 / 2896 properties...
{'property_name': 'Nicon Gardens', 'address': '61G Choa Chu Kang Road · 689395', 'number of units': '762', 'property type': 'Generic Condo', 'developer': 'Choa Chu Kang Land Ltd (Pidemco Land Ltd)', 'buildings': [{'address': '61 Choa Chu Kang Road', 'postal_code': '689388'}, {'address': '61A Choa Chu Kang Road', 'postal_code': '689389'}, {'address': '61B Choa Chu Kang Road', 'postal_code': '689390'}, {'address': '61C Choa Chu Kang Road', 'postal_code': '689391'}, {'address': '61D Choa Chu Kang Road', 'postal_code': '689392'}, {'address': '61E Choa Chu Kang Road', 'postal_code': '689393'}, {'address': '61F Choa Chu Kang Road', 'postal_code': '689394'}, {'address': '61G Choa Chu Kang Road', 'postal_code': '689395'}, {'address': '61H Choa Chu Kang Road', 'postal_code': '689396'}, {'address': '63 Choa Chu Kang Road', 'postal_code': '689397'}, {'address': '63A Choa Chu Kang Road', 'postal_code': '689398'}, {'address': '63B Choa Chu Kang Road', 'postal_code': 

In [24]:
# Optional: uncomment to save the raw scrape results to a JSON file.
# with open('property_data.json', 'w') as f:
#     json.dump(property_data, f)

# Process Data

In [95]:
# Build a polars frame with one row per (property, building):
#   - explode the list of building dicts so each building gets its own row
#   - split the property-level address on ' · ' into street address and postal code
#   - pull the building address / postal code out of the struct
#   - rename columns to snake_case, cast the unit count to an integer, and sort
property_df = (
    pl.DataFrame(property_data)
    .explode('buildings')
    .select(
        pl.col('property_name'),
        pl.col('address')
        .str.split_exact(' · ', 1)
        .struct.field('field_0')
        .alias('address_main'),
        pl.col('address')
        .str.split_exact(' · ', 1)
        .struct.field('field_1')
        .alias('postal_code_main'),
        pl.col('property type').alias('property_type'),
        pl.col('developer'),
        pl.col('number of units').cast(pl.Int64).alias('number_of_units'),
        pl.col('property_url'),
        pl.col('buildings').struct.field('address').alias('address_by_building'),
        pl.col('buildings').struct.field('postal_code').alias('postal_code_by_building'),
    )
    .sort(['property_name', 'postal_code_by_building'], descending=False)
)

In [None]:
# Save the processed per-building table as a CSV file
property_df.write_csv(file='condo_data.csv')