In [12]:
import numpy as np
import pandas as pd
import requests
import urllib.parse
import json
import os
import re
import time
import random
import undetected_chromedriver as uc

from datetime import datetime, timedelta
from google.oauth2.service_account import Credentials
from geopy.geocoders import Nominatim
from cryptography.fernet import Fernet

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from seleniumbase import SB

target_table = "real_estate.jakarta"
target_table_2 = "real_estate.most_recent"
project_id = "jakarta-housing-price"
job_location = "asia-southeast2"

# Decrypt the credentials file
def decrypt_file(encrypted_file, key):
    cipher_suite = Fernet(key)
    with open(encrypted_file, "rb") as file:
        encrypted_data = file.read()
    decrypted_data = cipher_suite.decrypt(encrypted_data)
    return json.loads(decrypted_data.decode("utf-8"))

# Get the FERNET_KEY from the environment
fernet_key = os.environ.get("FERNET_KEY")
decrypted_credentials = decrypt_file("encryption/encrypted_data.bin", fernet_key)
credential = Credentials.from_service_account_info(decrypted_credentials)

query_most_recent = pd.read_gbq(f"SELECT * FROM `{project_id}.{target_table_2}`", project_id=project_id, credentials=credential)
query_most_recent["date"] = pd.to_datetime(query_most_recent["date"])

# Lists to Store the Scraped Data
titles = []
links = []
locations = []
prices = []
bedrooms = []
bathrooms = []
garages = []
land_areas = []
building_areas = []
agents = []
dates = []

# Iterate through Each Page
conditions_met = False

with SB(uc_cdp=True, guest_mode=True) as sb:
    for page in range(1, 101):
        try:
            sb.open(f"https://www.rumah123.com/jual/dki-jakarta/rumah/?sort=posted-desc&page={page}#qid~a46c0629-67e4-410c-9c35-0c80e98987d9")

            if sb.is_element_visible('input[value*="Verify"]'):
                sb.click('input[value*="Verify"]')
                if page == 1:
                    sb.save_screenshot("page_screenshot_before.png")
            elif sb.is_element_visible('iframe[title*="challenge"]'):
                sb.switch_to_frame('iframe[title*="challenge"]')
                sb.click("span.mark")
                if page == 1:
                    sb.save_screenshot("page_screenshot_before.png")
            else:
                if page == 1:
                    sb.save_screenshot("page_screenshot_before.png")

            property_elements = sb.find_elements(By.XPATH, "//div[contains(@class, 'card-featured__content-wrapper')]")
            print(property_elements)

            # Iterate through Each Property Element
            index = 0
            for element in property_elements:
                try:
                    # Title
                    try:
                        title_element = element.find_element(By.XPATH, ".//a[h2]")
                        title = title_element.get_attribute("title")
                    except NoSuchElementException:
                        title = float("nan")

                    # Link
                    try:
                        link = title_element.get_attribute("href")
                    except NoSuchElementException:
                        link = float("nan")

                    # Location
                    try:
                        location = element.find_element(By.XPATH, ".//span[contains(text(), ',')]").text
                    except NoSuchElementException:
                        location = float("nan")

                    # Price
                    try:
                        price = element.find_element(By.CLASS_NAME, "card-featured__middle-section__price").text
                    except NoSuchElementException:
                        price = float("nan")

                    # Features
                    features_element = element.find_elements(By.XPATH, ".//div[@class='attribute-grid']/span[@class='attribute-text']")

                    # Extracting the attributes (like bedroom, bathroom, garage) from features_element
                    attributes = [float("nan")] * 3

                    for idx, attr_elem in enumerate(features_element[:3]):
                        text_content = attr_elem.text
                        if text_content.isdigit():
                            attributes[idx] = int(text_content)

                    bedroom, bathroom, garage = attributes

                    # Land Area
                    try:
                        land_area_text = element.find_element(By.XPATH, ".//div[contains(text(), 'LT : ')]/span").text.strip()
                        land_area = int(re.search(r"\d+", land_area_text).group()) if re.search(r"\d+", land_area_text) else float("nan")
                    except NoSuchElementException:
                        land_area = float("nan")

                    # Building Area
                    try:
                        building_area_text = element.find_element(By.XPATH, ".//div[contains(text(), 'LB : ')]/span").text.strip()
                        building_area = int(re.search(r"\d+", building_area_text).group()) if re.search(r"\d+", building_area_text) else float("nan")
                    except NoSuchElementException:
                        building_area = float("nan")

                    # Agent & Date
                    try:
                        agent_date_element = element.find_element(By.CLASS_NAME, "ui-organisms-card-r123-basic__bottom-section__agent")
                        
                        time_info = agent_date_element.find_element(By.XPATH, ".//p[1]").text
                        time_pattern = re.compile(r'(\d+\s\w+)')
                        time_match = time_pattern.search(time_info)

                        if time_match:
                            agent = time_match.group(1)
                        else:
                            agent = float("nan")

                        date = agent_date_element.find_element(By.XPATH, ".//p[2]").text.strip()
                    except NoSuchElementException:
                        agent = float("nan")
                        date = float("nan")

                    print(f"House {index + 1} (Page {page}):")

                    titles.append(title)
                    print(f"Title: {title}")

                    links.append(link)
                    print(f"Link: {link}")

                    locations.append(location)
                    print(f"Location: {location}")

                    prices.append(price)
                    print(f"Price: {price}")

                    bedrooms.append(bedroom)
                    print(f"Bedroom: {bedroom}")

                    bathrooms.append(bathroom)
                    print(f"Bathroom: {bathroom}")

                    garages.append(garage)
                    print(f"Garage: {garage}")

                    land_areas.append(land_area)
                    print(f"Land Area: {land_area}")

                    building_areas.append(building_area)
                    print(f"Building Area: {building_area}")

                    agents.append(date)
                    print(f"Agent: {date}")

                    def subtract_time_from_now(time_string):
                        time_parts = time_string.split()

                        number = int(time_parts[0])
                        unit = time_parts[-1]

                        now = datetime.now() + timedelta(hours=7)
                        # now = datetime.now()

                        if unit.lower() == "detik":
                            return now - timedelta(seconds=number)
                        elif unit.lower() == "menit":
                            return now - timedelta(minutes=number)
                        elif unit.lower() == "jam":
                            return now - timedelta(hours=number)
                        elif unit.lower() == "hari":
                            return now - timedelta(days=number)
                        else:
                            raise ValueError("Unknown time unit!")
                        
                    agent = subtract_time_from_now(agent)
                    dates.append(agent)
                    print(f"Date: {agent}")

                    print("--------------------")

                    # Check If Conditions Are Met
                    if title == query_most_recent["title"][0] and link == query_most_recent["link"][0] and \
                            location == query_most_recent["address"][0] and date == query_most_recent["agent"][0]:
                        print("CONDITIONS ARE MET")
                        conditions_met = True
                        break

                    index += 1

                except NoSuchElementException:
                    continue

            if page == 1:
                sb.save_screenshot("page_screenshot_after.png")

            if conditions_met:
                break

        except Exception as e:
            print(f"Error on page {page}: {str(e)}")
            continue

df = pd.DataFrame({
    "Title": titles,
    "Link": links,
    "Address": locations,
    "Bedroom": bedrooms,
    "Bathroom": bathrooms,
    "Garage": garages,
    "Land m2": land_areas,
    "Building m2": building_areas,
    "Price": prices,
    "Agent": agents,
    "Date": dates
})

df.to_csv("scraped_data.csv", index=False)

TypeError: argument should be a bytes-like object or ASCII string, not 'NoneType'

In [2]:
pip install undetected-chromedriver

Collecting undetected-chromedriver
  Downloading undetected-chromedriver-3.5.3.tar.gz (65 kB)
     -------------------------------------- 65.3/65.3 kB 706.2 kB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting selenium>=4.9.0
  Downloading selenium-4.13.0-py3-none-any.whl (9.5 MB)
     ---------------------------------------- 9.5/9.5 MB 3.6 MB/s eta 0:00:00
Collecting websockets
  Downloading websockets-11.0.3-cp310-cp310-win_amd64.whl (124 kB)
     ------------------------------------ 124.7/124.7 kB 198.0 kB/s eta 0:00:00
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting trio~=0.17
  Using cached trio-0.22.2-py3-none-any.whl (400 kB)
Collecting outcome
  Using cached outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting exceptiongroup>=1.0.0rc9
  Using cached exceptiongroup-1.1.3-py3-none-any.whl (14 kB)
Collecting wsproto>=0.14
  Using cached wsproto-1.2.

In [4]:
pip install --upgrade google-auth

Note: you may need to restart the kernel to use updated packages.Collecting google-auth
  Downloading google_auth-2.23.2-py2.py3-none-any.whl (181 kB)
     ------------------------------------ 182.0/182.0 kB 255.6 kB/s eta 0:00:00
Collecting rsa<5,>=3.1.4
  Downloading rsa-4.9-py3-none-any.whl (34 kB)
Collecting cachetools<6.0,>=2.0.0
  Downloading cachetools-5.3.1-py3-none-any.whl (9.3 kB)
Installing collected packages: rsa, cachetools, google-auth
Successfully installed cachetools-5.3.1 google-auth-2.23.2 rsa-4.9





In [6]:
pip install geopy

Collecting geopy
  Downloading geopy-2.4.0-py3-none-any.whl (125 kB)
     ------------------------------------ 125.4/125.4 kB 273.0 kB/s eta 0:00:00
Collecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
     -------------------------------------- 40.3/40.3 kB 174.9 kB/s eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.0 webdriver-manager-4.0.1
Note: you may need to restart the kernel to use updated packages.




In [10]:
pip install seleniumbase

Collecting seleniumbaseNote: you may need to restart the kernel to use updated packages.

  Downloading seleniumbase-4.19.1-py3-none-any.whl (542 kB)
     ------------------------------------ 542.5/542.5 kB 400.8 kB/s eta 0:00:00
Collecting pip>=23.2.1
  Using cached pip-23.2.1-py3-none-any.whl (2.1 MB)
Collecting packaging>=23.2
  Downloading packaging-23.2-py3-none-any.whl (53 kB)
     -------------------------------------- 53.0/53.0 kB 388.8 kB/s eta 0:00:00
Collecting parse>=1.19.1
  Downloading parse-1.19.1-py2.py3-none-any.whl (18 kB)
Collecting pygments==2.16.1
  Downloading Pygments-2.16.1-py3-none-any.whl (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 419.8 kB/s eta 0:00:00
Collecting beautifulsoup4==4.12.2
  Using cached beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
Collecting filelock>=3.12.4
  Downloading filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting wheel>=0.41.2
  Downloading wheel-0.41.2-py3-none-any.whl (64 kB)
     ---------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder-kernels 2.4.1 requires jupyter-client<8,>=7.3.4; python_version >= "3", but you have jupyter-client 8.2.0 which is incompatible.
distributed 2022.7.0 requires tornado<6.2,>=6.0.3, but you have tornado 6.3.1 which is incompatible.
conda-repo-cli 1.0.27 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.27 requires nbformat==5.4.0, but you have nbformat 5.7.0 which is incompatible.
conda-repo-cli 1.0.27 requires requests==2.28.1, but you have requests 2.31.0 which is incompatible.
