In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time

def initialize_driver():
    chrome_options = Options()
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    service = Service('/Users/arundhuti/Downloads/chromedriver-mac-arm64/chromedriver')
    return webdriver.Chrome(service=service, options=chrome_options)

# Initialize WebDriver and navigate to the main page
driver = initialize_driver()
driver.get("https://www.99acres.com/real-estate-city-insights-lrffid?preference=RENTAL&src_page=HP")

# Find all city elements and extract URLs
city_elements = driver.find_elements(By.CLASS_NAME, 'ccd__heading')
city_names = [element.text.replace("Localities in ", "") for element in city_elements]
print("City Names:", city_names)

city_urls = [element.get_attribute('href') for element in city_elements if element.get_attribute('href') is not None]
print("City URLs:", city_urls)

with open('city_urls.json', 'w') as file:
    json.dump(city_urls, file)

print(f"Extracted {len(city_urls)} city URLs.")
driver.quit()

# Process each URL to extract localities and their hrefs
city_localities = {}

with open('city_urls.json', 'r') as file:
    city_urls = json.load(file)

for url in city_urls:
    print(f"Processing URL: {url}")
    driver = initialize_driver()
    
    try:
        driver.get(url)
        
        # Wait for locality elements to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'section_header_semiBold'))
        )

        # Find all <a> tags with class 'section_header_semiBold'
        locality_elements = driver.find_elements(By.CLASS_NAME, 'section_header_semiBold')
        
        # Extract locality names (text) and URLs (href)
        localities = [{"name": element.get_attribute('title'), "url": element.get_attribute('href')} for element in locality_elements if element.get_attribute('href') is not None]
        
        city_name = url.split('/')[-1].split('-lrffid')[0]  # Adjust as necessary
        city_localities[city_name] = localities

    except Exception as e:
        print(f"Error processing URL {url}: {e}")

    finally:
        driver.quit()
        time.sleep(10)  # Pause to avoid overloading the server

# Save the localities data (name and URL) to a file
with open('city_localities.json', 'w') as file:
    json.dump(city_localities, file, indent=4)

print("Data saved to city_localities.json.")




City Names: ['Bangalore', 'Delhi', 'Hyderabad', 'Pune', 'Chennai', 'Mumbai', 'Kolkata', 'Navi Mumba...', 'Gurgaon', 'Ahmedabad', 'Jaipur', 'Lucknow', 'Ghaziabad', 'Nagpur', 'Noida']
City URLs: ['https://www.99acres.com/localities-in-bangalore-lrffid', 'https://www.99acres.com/localities-in-delhi-lrffid', 'https://www.99acres.com/localities-in-hyderabad-lrffid', 'https://www.99acres.com/localities-in-pune-lrffid', 'https://www.99acres.com/localities-in-chennai-lrffid', 'https://www.99acres.com/localities-in-mumbai-lrffid', 'https://www.99acres.com/localities-in-kolkata-lrffid', 'https://www.99acres.com/localities-in-navi-mumbai-lrffid', 'https://www.99acres.com/localities-in-gurgaon-lrffid', 'https://www.99acres.com/localities-in-ahmedabad-lrffid', 'https://www.99acres.com/localities-in-jaipur-lrffid', 'https://www.99acres.com/localities-in-lucknow-lrffid', 'https://www.99acres.com/localities-in-ghaziabad-lrffid', 'https://www.99acres.com/localities-in-nagpur-lrffid', 'https://www.99acr

In [29]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def initialize_driver():
    chrome_options = Options()
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    chrome_options.add_argument("--start-maximized")  # Open browser in full screen
    service = Service('/Users/arundhuti/Downloads/chromedriver-mac-arm64/chromedriver')
    driver = webdriver.Chrome(service=service, options=chrome_options)
    driver.maximize_window()  # Ensure the window is maximized
    return driver

import time

def scroll_page(driver):
    logging.info("Starting slow scroll down the page...")
    
    total_scroll_height = driver.execute_script("return document.body.scrollHeight")
    visible_window_height = driver.execute_script("return window.innerHeight")

    if total_scroll_height > visible_window_height:
        scroll_pause_time = 20  # 20 seconds delay
        scroll_increment = total_scroll_height // 16  # Scroll by 1/16th of the page

        last_height = driver.execute_script("return document.documentElement.scrollTop")

        while True:
            driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
            logging.info(f"Scrolled to {driver.execute_script('return document.documentElement.scrollTop'):.2f} / {total_scroll_height:.2f}")

            # Ensure page is fully loaded before proceeding
            page_state = driver.execute_script("return document.readyState")
            while page_state != 'complete':
                time.sleep(1)
                page_state = driver.execute_script("return document.readyState")
            
            # Explicitly wait for 20 seconds between scrolls
            time.sleep(scroll_pause_time)

            new_scroll_position = driver.execute_script("return document.documentElement.scrollTop")
            
            if new_scroll_position == last_height:
                break
            
            last_height = new_scroll_position

        logging.info("Finished scrolling the page.")
    else:
        logging.info("No vertical scrolling needed.")


def scrape_properties(locality_url):
    driver = initialize_driver()
    properties = []

    try:
        driver.get(locality_url)
        scroll_slowly(driver)

        # Wait for the property cards to load (30 seconds wait)
        logging.info("Waiting for property cards to load...")
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'ProgressBarItemContainer__title'))
        )
        logging.info("Property cards loaded.")

        # Find all property cards
        property_elements = driver.find_elements(By.CLASS_NAME, 'ProgressBarItemContainer__title')
        logging.info(f"Found {len(property_elements)} properties.")
        
        for property_element in property_elements:
            property_name = property_element.text
            properties.append(property_name)
            logging.info(f"Extracted Property Name: {property_name}")

    except Exception as e:
        logging.error(f"Error scraping properties for {locality_url}: {e}")

    finally:
        driver.quit()
        time.sleep(10)  # Add delay to avoid overwhelming the server

    return properties

# Load city-localities data
with open('city_localities.json', 'r') as file:
    city_localities = json.load(file)

# Final structure to store city -> localities -> properties
city_localities_properties = {}

# Iterate over cities and localities
for city, localities_info in city_localities.items():
    city_localities_properties[city] = {}

    for locality_info in localities_info:
        locality_name = locality_info['name']
        locality_url = locality_info['url']

        logging.info(f"Scraping properties for locality: {locality_name} in city: {city}")

        # Scrape properties for each locality
        properties = scrape_properties(locality_url)

        if not properties:
            logging.warning(f"No properties found for locality: {locality_name} in city: {city}")
            continue

        # Store the properties under the respective city and locality
        city_localities_properties[city][locality_name] = properties

        # Save progress incrementally to avoid data loss
        with open('city_localities_properties.json', 'w') as file:
            json.dump(city_localities_properties, file, indent=4)

        # To avoid server overload
        time.sleep(5)

logging.info("Scraping completed. Data saved to city_localities_properties.json.")


2024-09-15 13:17:50,792 - INFO - Scraping properties for locality: Whitefield Overview in city: localities-in-bangalore
2024-09-15 13:17:51,872 - INFO - Starting slow scroll down the page...
2024-09-15 13:17:52,593 - INFO - Scrolled to 408.94 / 6543.00
2024-09-15 13:17:55,602 - INFO - Scrolled to 817.88 / 6543.00
2024-09-15 13:17:58,616 - INFO - Scrolled to 1226.81 / 6543.00
2024-09-15 13:18:01,628 - INFO - Scrolled to 1635.75 / 6543.00
2024-09-15 13:18:04,634 - INFO - Scrolled to 2044.69 / 6543.00
2024-09-15 13:18:07,647 - INFO - Scrolled to 2453.62 / 6543.00
2024-09-15 13:18:10,662 - INFO - Scrolled to 2862.56 / 6543.00
2024-09-15 13:18:13,676 - INFO - Scrolled to 3271.50 / 6543.00
2024-09-15 13:18:16,693 - INFO - Scrolled to 3680.44 / 6543.00
2024-09-15 13:18:19,709 - INFO - Scrolled to 4089.38 / 6543.00
2024-09-15 13:18:22,721 - INFO - Scrolled to 4498.31 / 6543.00
2024-09-15 13:18:25,735 - INFO - Scrolled to 4907.25 / 6543.00
2024-09-15 13:18:28,747 - INFO - Scrolled to 5316.19 / 

KeyboardInterrupt: 