In [4]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Setup Chrome options
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Uncomment for headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chromium parses this switch as "width,height"; the "1920x1080" spelling is
# not recognised, which matters once --headless is enabled (a headless
# browser cannot rely on --start-maximized for its viewport).
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Initialize WebDriver
# The default path is machine-specific; set the CHROMEDRIVER_PATH environment
# variable to point at a different chromedriver binary without editing code.
chromedriver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe",
)
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Base URL (the page number is appended to form each listing-page URL)
BASE_URL = "https://www.commonfloor.com/chennai-property/for-sale?page="

# Folder for the Excel output & file that stores the last scraped page number
SAVE_FOLDER = "chennai_estate"
PROGRESS_FILE = "final_scraped_page.txt"

# Create the output folder; exist_ok avoids the check-then-create race and
# makes re-running the cell a no-op.
os.makedirs(SAVE_FOLDER, exist_ok=True)

# Get last scraped page
def get_final_scraped_page():
    """Return the page number from which scraping should resume.

    Reads the integer stored in ``PROGRESS_FILE``.

    Returns:
        int: The saved page number, or ``1`` when the progress file does not
        exist, is empty, holds something that is not an integer, or holds a
        value below 1 (page numbering starts at 1).
    """
    # EAFP: open directly instead of exists()-then-open, which can race with
    # the file being removed in between.
    try:
        with open(PROGRESS_FILE, "r") as f:
            saved_text = f.read().strip()
    except FileNotFoundError:
        return 1  # Start from page 1 if no progress file exists

    try:
        saved_page = int(saved_text)
    except ValueError:
        # An empty or corrupt progress file (e.g. from an interrupted write)
        # must not prevent the scraper from starting at all.
        print(f"⚠️ Could not read a page number from {PROGRESS_FILE!r}; starting from page 1.")
        return 1

    return max(saved_page, 1)

# Persist the resume point
def save_progress(page_number):
    """Overwrite ``PROGRESS_FILE`` with the text form of ``page_number``.

    Args:
        page_number: Page number to record; written via ``str()`` with no
            trailing newline.
    """
    with open(PROGRESS_FILE, mode="w") as progress_handle:
        # print() stringifies its argument; end="" suppresses the newline so
        # the file holds only the number.
        print(page_number, end="", file=progress_handle)

# XPaths, relative to one listing tile, for each scraped column. The order
# matches the row layout returned by scrape_page:
# [property name, area, value, possession status, agent name].
_LISTING_FIELD_XPATHS = (
    ".//a[contains(@href, '/listing/')]",    # Property Name
    ".//span[not(@class)]",                  # Property Area
    ".//span[@class='s_p ']",                # Property Value
    ".//div[@class='infodata']/span",        # Possession Status
    ".//div[@class='infownertext']/small",   # Agent Name
)


def _listing_field_text(listing, xpath):
    """Return the stripped text of the first match of ``xpath`` under ``listing``.

    Returns ``"N/A"`` when no element matches or the element has gone stale,
    so a missing field never discards the rest of the row.
    """
    try:
        return listing.find_element(By.XPATH, xpath).text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        return "N/A"


# Scrape a single page
def scrape_page(page_number):
    """Load one results page and extract a row per listing tile.

    Navigates the shared ``driver`` to ``BASE_URL`` + ``page_number``, waits
    for the listing tiles to appear, and reads five text fields from each.
    On success the page number is recorded with ``save_progress``.

    Args:
        page_number: Page index appended to ``BASE_URL``.

    Returns:
        A list of ``[property_name, area, value, possession, agent]`` lists
        (missing fields are ``"N/A"``), or ``None`` when no listing tile
        appears within the 10-second wait, which the caller treats as the
        end of the results.
    """
    url = f"{BASE_URL}{page_number}"
    print(f"🔄 Scraping Page {page_number}...")

    driver.get(url)
    time.sleep(3)  # Allow page to load

    # Try to find property elements.
    # NOTE(review): the captured run output prints every listing name twice,
    # which suggests this XPath matches more than one element per listing
    # (contains() also matches class names that merely include 'snb-tile').
    # Confirm against the live DOM before tightening the selector.
    try:
        property_listings = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'snb-tile')]"))
        )
    except TimeoutException:
        # Only a timeout means "no tiles on this page". Anything else
        # (including Ctrl+C) must propagate instead of ending the run as if
        # the last page had been reached.
        print(f"⚠️ No properties found on page {page_number}. Stopping.")
        return None  # No more pages left

    page_data = []

    for listing in property_listings:
        try:
            row = [_listing_field_text(listing, xpath) for xpath in _LISTING_FIELD_XPATHS]

            # Store data
            page_data.append(row)

            print(f"✅ Scraped: {row[0]}")

        except Exception as e:
            # Best effort: one broken tile must not abort the whole page.
            print(f"⚠️ Skipping listing due to error: {e}")

    save_progress(page_number)  # Save progress after successful page scrape
    return page_data  # Return data for this page

def _save_chunk(rows, last_page):
    """Write ``rows`` to ``SAVE_FOLDER/chennai_estate_page_<last_page>.xlsx``."""
    df = pd.DataFrame(rows, columns=["Property Name", "Area", "Value", "Possession Status", "Agent Name"])
    file_name = os.path.join(SAVE_FOLDER, f"chennai_estate_page_{last_page}.xlsx")
    df.to_excel(file_name, index=False)
    print(f"📂 Saved data to {file_name}")


# Get last scraped page
start_page = get_final_scraped_page()
TOTAL_PAGES = 799  # Set an upper limit (update as needed)

# Store data in memory before saving to Excel
all_data = []
last_scraped_page = None  # Most recent page whose rows are in all_data

try:
    # Start scraping from the last saved page
    for page in range(start_page, TOTAL_PAGES + 1):
        scraped_data = scrape_page(page)

        if scraped_data is None:
            break  # No more pages found, stop scraping

        all_data.extend(scraped_data)
        last_scraped_page = page

        # Save to Excel every 3 pages
        if page % 3 == 0:
            _save_chunk(all_data, page)
            all_data = []  # Reset after saving

        # Fixed pause between page loads to keep the request rate low
        time.sleep(2)
finally:
    try:
        # Rows gathered since the last multiple-of-3 page would otherwise be
        # lost: at the end of the range (799 is not a multiple of 3), on an
        # early break, or when an error / Ctrl+C interrupts the loop.
        if all_data:
            _save_chunk(all_data, last_scraped_page)
    finally:
        # Close the driver even if scraping or saving failed, so no browser
        # process is left running.
        driver.quit()

print("✅ Scraping completed!")


🔄 Scraping Page 500...
✅ Scraped: Plot for Sale in Navallur
✅ Scraped: Plot for Sale in Navallur
✅ Scraped: Semi Furnished 3 BHK Villa for Sale in Kanathur
✅ Scraped: Semi Furnished 3 BHK Villa for Sale in Kanathur
✅ Scraped: Semi Furnished 3BHK Apartment for Sale in Ambattur
✅ Scraped: Semi Furnished 3BHK Apartment for Sale in Ambattur
✅ Scraped: Plot for Sale in Vengal
✅ Scraped: Plot for Sale in Vengal
✅ Scraped: Semi Furnished 3BHK Apartment for Sale in Pallikaranai
✅ Scraped: Semi Furnished 3BHK Apartment for Sale in Pallikaranai
✅ Scraped: Plot for Sale in Poonamallee
✅ Scraped: Plot for Sale in Poonamallee
✅ Scraped: Plot for Sale in West Tambaram
✅ Scraped: Plot for Sale in West Tambaram
✅ Scraped: Plot for Sale in Kelambakkam
✅ Scraped: Plot for Sale in Kelambakkam
✅ Scraped: Plot for Sale in Kondavakkam
✅ Scraped: Plot for Sale in Kondavakkam
✅ Scraped: Plot for Sale in Urapakkam West
✅ Scraped: Plot for Sale in Urapakkam West
✅ Scraped: Plot for Sale in Muttukadu
✅ Scraped: 