In [2]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
def wait_pageloading(driver, wait):
    title = driver.title
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except:
        print(f"The webpage \"{title}\" did not get fully loaded")
    else:
        print(f"The webpage \"{title}\" got fully loaded")

In [109]:
chrome_options = Options()
chrome_options.add_argument("--disable-http2")
chrome_options.add_argument("--icognito")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--ignore-certificate-errors")
chrome_options.add_argument("--enable-features=NetworkServiceInProcess")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36")

driver  = webdriver.Chrome(options=chrome_options)
driver.maximize_window()

# explicit wait
wait = WebDriverWait(driver, 5)

# acessing the target webpage
url = 'https://www.99acres.com/'
driver.get(url)
wait_pageloading(driver, wait)

# identifying search bar 
try:
    search_bar = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="keyword2"]'))
    )
except:
    print("timeout while loacting search bar")
else:
    search_bar.send_keys("Chennai")
    time.sleep(2)


# selecting valid option from the dropdown list 
try:
    valid_opt = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="0"]'))
    )
except:
    print("timeout while locating valid search option")
else:
    valid_opt.click()
    time.sleep(2)
    
# clicking the search button
try:
    search_btn = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform_search_btn"]'))
    )
except:
    print("could not locate the search button and click it")
else:
    search_btn.click()
    wait_pageloading(driver, wait)

# locating the bdget ---> max_btn
try:
    budget_max = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="undefined"]'))
    )
except:
    print("max dropdown not available")
else:
    budget_max.click()
    time.sleep(2)

# selecting 5cr value
try:
    select_val = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="lf_budget_max_list"]/li[71]'))
    )
except:
    print("could not find the value")
else:
    select_val.click()
    time.sleep(2)

# applying various filter 
# 1. verified 
try:
    verified_opt = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//span[normalize-space()='Verified']"))
    )
except:
    print("could not locate the verified option")
else:
    verified_opt.click()
    time.sleep(2)

# 2. clicking the ready_to_move button 
try:
    ready_to_move = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//span[normalize-space()='Ready To Move']"))
    )
except:
    print("could not find the ready to move area")
else:
    ready_to_move.click()
    time.sleep(2)

# 3. clicking the next button 
while True:
    try:
        next_btn = wait.until(
            EC.presence_of_element_located((By.XPATH, "//i[contains(@class,'iconS_Common_24 icon_upArrow cc__rightArrow')]"))
        )
    except:
        print("Could not click the next button because we have reached till the end")
        break
    else:
        next_btn.click()
        time.sleep(1)

# 4. loacting the with_photo option 
try:
    with_pic = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//span[normalize-space()='With Photos']"))
    )
except:
    print("could not load the photo area")
else:
    with_pic.click()
    time.sleep(1)

# 5. loacting the with_video option 
try:
    with_video = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//span[normalize-space()='With Videos']"))
    )
except:
    print("could not load the video area")
else:
    with_video.click()
    time.sleep(3)

# navigating the pages and extract data
data = []
page_count = 0
while True:
    page_count +=1
    try:
        # Wait for next button to be clickable
        next_page_btn = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//a[contains(text(),'Next')]"))
        )
    except:
        print(f"Timeout because we have navigated through all the {page_count} pages.\n")
        break
    else:
        try:
            driver.execute_script("window.scrollBy(0, arguments[0].getBoundingClientRect().top - 100);", next_page_btn)

            time.sleep(2)

            # scraping the data
            rows = driver.find_elements(By.CLASS_NAME, "tupleNew__contentWrap")
            for row in rows:
                # property name
                try:
                    name = row.find_element(By.CLASS_NAME, "tupleNew__headingNrera").text
                except:
                    name = np.nan

                # property location 
                try:
                    location = row.find_element(By.CLASS_NAME, "tupleNew__propType").text
                except:
                    location = np.nan

                # property price 
                try:
                    price = row.find_element(By.CLASS_NAME, "tupleNew__priceValWrap").text
                except:
                    price = np.nan

                # property area and bhk
                try:
                    area_elem = row.find_elements(By.CLASS_NAME, "tupleNew__area1Type")
                except:
                    area, bhk = [np.nan, np.nan]
                else:
                    area, bhk = [ele.text for ele in area_elem]

                property = {
                    "name": name,
                    "location": location,
                    "price": price,
                    "area": area,
                    "bhk": bhk
                }

                data.append(property)

            wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a[contains(text(),'Next')]"))
            ).click()
            time.sleep(5)
        except:
            print("Timeout while clicking on \"Next page\" .\n")
        
# scraping data for the last page
rows = driver.find_elements(By.CLASS_NAME, "tupleNew__contentWrap")
for row in rows:
    # property name
    try:
        name = row.find_element(By.CLASS_NAME, "tupleNew__headingNrera").text
    except:
        name = np.nan

    # property location 
    try:
        location = row.find_element(By.CLASS_NAME, "tupleNew__propType").text
    except:
        location = np.nan

    # property price 
    try:
        price = row.find_element(By.CLASS_NAME, "tupleNew__priceValWrap").text
    except:
        price = np.nan

    # property area and bhk
    try:
        area_elem = row.find_elements(By.CLASS_NAME, "tupleNew__area1Type")
    except:
        area, bhk = [np.nan, np.nan]
    else:
        area, bhk = [ele.text for ele in area_elem]

    property = {
        "name": name,
        "location": location,
        "price": price,
        "area": area,
        "bhk": bhk
    }

    data.append(property)

time.sleep(2)
driver.close()


The webpage "India Real Estate Property Site - Buy Sell Rent Properties Portal - 99acres.com" got fully loaded
The webpage "Property in Chennai - Real Estate in Chennai" got fully loaded
Could not click the next button because we have reached till the end
Timeout while clicking on "Next page" .

Timeout because we have navigated through all the 18 pages.



In [None]:
df_property = (
    pd
    .DataFrame(data)
    .drop_duplicates()
    .apply(lambda col: col.str.strip().str.lower() if pd.api.types.is_string_dtype(col) else col)
    .assign(
        is_starred = lambda df_: df_.name.str.contains("\n").astype(int),
        name = lambda df_ : (
            df_ 
            .name
            .str.replace(r"\n[0-9.]+", '', regex=True)
            .str.strip()
        ),
        location = lambda df_:(
            df_ 
            .location
            .str.replace("chennai", "")
            .str.strip()
            .str.replace(r",$", "", regex=True)
            .str.split("in").str[-1].str.strip()
        ),
        price = lambda df_ :(
            df_ 
            .price
            .str.replace("â‚¹", "")
            .apply(
                lambda val: float(val.replace("lac", "").strip()) if "lac" in val else float(val.replace("cr", "").strip()) *100
            )
        ),
        area = lambda df_ :(
            df_ 
            .area
            .str.replace("sqft", "")
            .str.strip()
            .str.replace(",", "")
            .pipe(lambda ser: pd.to_numeric(ser, errors="coerce"))
        ),
        bhk = lambda df_ :(
            df_ 
            .bhk
            .str.replace("bhk", "")
            .str.strip()
            .pipe(lambda ser: pd.to_numeric(ser, errors="coerce"))
        )
    )
    .rename(columns = {
        "price" : "price_lakhs",
        "area": "area_sqft"
    })
    .reset_index(drop=True)
    .to_excel("chennai-properties-99acres.xlsx", index=False)
)
