# Web Scraping Rumah 123

## Import library yang dibutuhkan

In [None]:
import time
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup

## Fungsi untuk melakukan konversi mata uang ke int

In [3]:
def convert_harga_to_int(str):
    str = str.replace(",", ".")
    str_split = str.split()

    if len(str_split) <= 2:
        harga = int(float(str_split[1]) * 1_000_000)
        return harga
    else:
        if str_split[2].lower() == "juta":
            harga = int(float(str_split[1]) * 1_000_000)
            return harga
        elif str_split[2].lower() == "miliar":
            harga = int(float(str_split[1]) * 1_000_000_000)
            return harga

## Create function to scrape the website

This function scrapes the website and creates an object based on the values it got. While scraping, the functions also preprocess the data so no null/bad values is added to the object list


In [None]:
def scraper(firstpage, lastpage):
    listOfHouse = []
    i = firstpage
    while i < lastpage + 1:
        base_url = "https://www.rumah123.com/jual/surabaya/rumah"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept-Encoding": "gzip",
            "Connection": "keep-alive",
        }
        response = requests.get(base_url, params={"page": i}, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        wrapper_el = soup.find_all("div", {"class": "card-featured__content-wrapper"})

        for prop in tqdm(wrapper_el):
            # Title
            property_title = prop.find("h2").text

            # Price
            property_price = convert_harga_to_int(
                prop.find("div", class_="card-featured__middle-section__price").text
            )

            # jika harga 0, skip
            if property_price == 0:
                continue

            # Get wrapper element for
            property_featured_middle = prop.find(
                "div", class_="card-featured__middle-section__attribute"
            )
            # bathroom, bedroom, and garage capacity
            property_element = property_featured_middle.findAll(
                "span", class_="attribute-text"
            )
            # array of max 3 elements (kt km g)
            kt = 0 if len(property_element) < 1 else int(property_element[0].text)
            km = 0 if len(property_element) < 2 else int(property_element[1].text)
            garasi = 0 if len(property_element) < 3 else int(property_element[2].text)

            # info regarding land and building area
            property_area = property_featured_middle.findAll(
                "div", class_="attribute-info"
            )
            # LT = Luas tanah, LB = Luas bangunan
            property_LT = (
                0
                if len(property_area) < 1
                else int(property_area[0].find("span").text.split(" ")[0])
            )
            property_LB = (
                0
                if len(property_area) < 2
                else int(property_area[1].find("span").text.split(" ")[0])
            )

            if not (km != 0 or kt != 0 or property_LT != 0 or property_LB != 0):
                continue

            listOfHouse.append(
                [
                    property_price,
                    kt,
                    km,
                    garasi,
                    int(property_LT),
                    int(property_LB),
                ]
            )
            # wait 2 second
            time.sleep(2)

        i += 1

    print(f"Scraping page from {firstpage} until {lastpage} has finished!")
    return listOfHouse

## Testing the web scraper function

This testing is done to make sure the web scraper isn't blocked by the website and is able to get data from the website


In [None]:
# testing to test if i'm blocked from the web
test = scraper(1, 1)

100%|██████████| 20/20 [00:40<00:00,  2.00s/it]



Scraping page from 1 until 1 has finished!





In [10]:
test

[[1500000000, 4, 2, 1, 135, 150],
 [1110000000, 3, 2, 1, 50, 70],
 [1310000000, 3, 2, 1, 70, 70],
 [1650000000, 3, 3, 1, 84, 130],
 [770000000, 3, 2, 1, 55, 64],
 [505000000, 2, 1, 1, 60, 36],
 [2500000000, 6, 5, 0, 160, 120],
 [13700000000, 5, 6, 2, 330, 600],
 [3500000000, 6, 5, 1, 144, 185],
 [3600000000, 3, 3, 2, 154, 250],
 [3420000000, 4, 3, 1, 375, 384],
 [3300000000, 4, 3, 0, 140, 200],
 [6000000000, 6, 4, 1, 593, 440],
 [2800000000, 4, 3, 2, 384, 250],
 [7000000000, 2, 2, 0, 458, 850],
 [900000000, 3, 2, 1, 50, 82],
 [600000000, 2, 1, 1, 60, 48],
 [27000000000, 10, 9, 1, 1200, 900],
 [1060000000, 2, 2, 1, 60, 57],
 [7000000000, 4, 4, 2, 337, 350]]

## Scraping rumah123.com

The actual scraping. In order to not overload the website, I scraped the website with some intervals. This is done so that we don't get blocked because our suspicious requests made to the website


In [None]:
data = scraper(1, 50)

In [None]:
# checking if scraping is successful, length must be > 0
len(data)

## Web Scrape 2nd Interval


In [None]:
data += scraper(51, 100)

In [None]:
len(data)

## Data Preprocessing

Because all the data are valid, we only need to concatenate the HouseData object list into a single list


In [None]:
import pandas as pd

df = pd.DataFrame(
    data=data,
    columns=[
        "price",
        "num_bedroom",
        "num_bathroom",
        "garage_capacity",
        "land_area",
        "building_area",
    ],
)

df

Unnamed: 0,price,num_bedroom,num_bathroom,garage_capacity,land_area,building_area
0,1500000000,4,2,1,135,150
1,1110000000,3,2,1,50,70
2,1310000000,3,2,1,70,70
3,1650000000,3,3,1,84,130
4,770000000,3,2,1,55,64
5,505000000,2,1,1,60,36
6,2500000000,6,5,0,160,120
7,13700000000,5,6,2,330,600
8,3500000000,6,5,1,144,185
9,3600000000,3,3,2,154,250


In [None]:
df.to_csv("data/surabaya-house-prices.csv", index=False)