In [1]:
%pip install selenium pandas openpyxl


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import re
import json
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Selenium setup
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chrome expects the size as "width,height"; the "1920x1080" spelling is not
# recognised as a size and the switch is effectively ignored.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to run on another machine; the original path remains the default.
driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe",
)
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Open the website
url = "https://registration.ap.gov.in/igrs/reports/Reports/misFormReport"
try:
    driver.get(url)
except Exception:
    # Do not leave an orphaned browser behind when the initial navigation fails.
    driver.quit()
    raise

# Shared explicit wait (10 second timeout) used by the code below.
# Constructing it does not block; only wait.until(...) calls do.
wait = WebDriverWait(driver, 10)
print("Website loaded successfully!")

def select_dropdown_option(dropdown_name, value):
    """Choose an entry in a <select> element located by its ``name`` attribute.

    Args:
        dropdown_name: Value of the ``name`` attribute of the <select> element.
        value: The ``value`` attribute of the <option> to select.

    Blocks on the module-level ``wait`` until the element is present in the
    DOM, selects the option, then pauses for a fixed two seconds.
    """
    locator = (By.NAME, dropdown_name)
    element = wait.until(EC.presence_of_element_located(locator))
    Select(element).select_by_value(value)
    # Fixed pause so the page can react to the new selection.
    time.sleep(2)

# Save progress
progress_file = "scrape_prog.json"

def save_progress(district, sro, village):
    """Persist the most recently processed district/SRO/village values.

    The JSON is written to a sibling temporary file and then moved over
    ``progress_file`` with ``os.replace``, so an interruption mid-write cannot
    leave a truncated progress file behind.
    """
    tmp_path = progress_file + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump({"district": district, "sro": sro, "village": village}, f)
    os.replace(tmp_path, progress_file)

def load_progress():
    """Return the saved progress dict, or None when there is nothing usable.

    Returns:
        A dict with the keys "district", "sro" and "village", or None if the
        file does not exist, cannot be read/parsed, or lacks one of those keys.
    """
    if not os.path.exists(progress_file):
        return None
    try:
        with open(progress_file, "r") as f:
            progress = json.load(f)
    except (ValueError, OSError) as exc:
        # json.JSONDecodeError is a ValueError; an unreadable file must not
        # abort the run, it only means there is no resume point.
        print(f"Ignoring unreadable progress file {progress_file}: {exc}")
        return None
    required_keys = ("district", "sro", "village")
    if not isinstance(progress, dict) or any(key not in progress for key in required_keys):
        print(f"Ignoring incomplete progress file {progress_file}")
        return None
    return progress

# Load previous progress
progress = load_progress()
# .get() so a progress file that lacks a key just disables that resume marker.
start_district = progress.get("district") if progress else None
start_sro = progress.get("sro") if progress else None
start_village = progress.get("village") if progress else None

# Column headers written to every exported spreadsheet.
TABLE_COLUMNS = ["S.No.", "Ward No. - Block No.", "Locality", "Unit Rate", "Ground Floor", "First Floor", "Other Floor", "Classification", "Effective Date"]


def _option_map(select_element):
    """Return {value: visible text} for each <option> with a non-empty value."""
    options = {}
    for option in select_element.find_elements(By.TAG_NAME, "option"):
        option_value = option.get_attribute("value")
        if option_value:
            options[option_value] = option.text
    return options


def _safe_name(name):
    """Replace characters that are invalid in Windows file names with '_'."""
    return re.sub(r'[<>:"/\\|?*]', '_', name).strip()


try:
    # Get list of all districts
    district_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "district")))
    district_options = list(_option_map(district_dropdown))

    print(f"Found {len(district_options)} districts!")

    if start_district and start_district not in district_options:
        # A stale marker would otherwise make the loop skip every district.
        print(f"Saved district {start_district} not found, starting from the beginning.")
        start_district = start_sro = start_village = None

    # Loop through districts
    for district_value in district_options:
        if start_district and district_value != start_district:
            continue
        start_district = None  # Reset after resuming

        select_dropdown_option("district", district_value)

        # Get list of all SROs
        sro_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "sro")))
        sro_options = _option_map(sro_dropdown)

        for sro_value, sro_name in sro_options.items():
            if start_sro and sro_value != start_sro:
                continue
            start_sro = None  # Reset after resuming

            select_dropdown_option("sro", sro_value)

            # Get list of all villages
            village_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "village")))
            village_options = _option_map(village_dropdown)

            for village_value, village_name in village_options.items():
                if start_village and village_value != start_village:
                    continue
                start_village = None  # Reset after resuming

                select_dropdown_option("village", village_value)

                # Select Form 2
                select_dropdown_option("additionalOption", "form2")

                # Click "Get Details" button
                get_details_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button/span[text()='Get Details']")))
                get_details_button.click()

                time.sleep(3)

                try:
                    # Wait for the table; a timeout is treated as "No Data Found"
                    table = WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "reportsTable"))
                    )
                except TimeoutException:
                    print(f"    No Data Found for {village_name}, skipping...")

                    # Click anywhere on the page to reset
                    try:
                        driver.find_element(By.TAG_NAME, "body").click()
                        time.sleep(1)
                    except NoSuchElementException:
                        pass
                else:
                    # Table loaded: collect the text of every data row
                    data = []
                    for row in table.find_elements(By.TAG_NAME, "tr")[1:]:  # Skip header
                        cells = [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
                        if cells:  # rows without <td> cells carry no data
                            data.append(cells)

                    if data:
                        df = pd.DataFrame(data, columns=TABLE_COLUMNS)

                        # Save data; both path components are sanitised, and the
                        # option value is the fallback when the name ends up empty.
                        safe_file_name = _safe_name(village_name) or village_value
                        safe_folder_name = _safe_name(sro_name) or sro_value
                        folder_path = f"registration/Form2/{safe_folder_name}"
                        os.makedirs(folder_path, exist_ok=True)
                        file_path = os.path.join(folder_path, f"{safe_file_name}.xlsx")

                        df.to_excel(file_path, index=False)
                        print(f"    Data saved: {file_path}")

                # Record the village as processed whether or not it had data
                save_progress(district_value, sro_value, village_value)

            # The saved village only applies to the SRO that was being resumed;
            # if it was not found there, do not skip villages of later SROs.
            start_village = None

        # Likewise the saved SRO only applies to the resumed district.
        start_sro = None
        start_village = None
finally:
    # Always close the browser, including when the scrape aborts with an error.
    driver.quit()

print("Scraping completed and browser closed!")


Website loaded successfully!
Found 26 districts!
    No Data Found for POKKUNURUA, skipping...
    No Data Found for POPURU, skipping...
    No Data Found for PUNNAVALLI, skipping...
    No Data Found for RAGHAVAPURAM, skipping...
    No Data Found for RAMIREEDIPALLE, skipping...
    No Data Found for RUDRAVARAM, skipping...
    No Data Found for SANAGAPADU, skipping...
    No Data Found for SATYAVARAM, skipping...
    No Data Found for SOMAVARAM, skipping...
    No Data Found for THAKKELLAPADU, skipping...
    No Data Found for THORRAGUDIPADU, skipping...
    No Data Found for THOTACHERLA, skipping...
    No Data Found for THOTARAVULAPADU, skipping...
    No Data Found for THURLAPADU, skipping...
    No Data Found for USTEPALLE, skipping...
    No Data Found for VELADI, skipping...
    No Data Found for AMBAPURAM, skipping...
    No Data Found for JAKKAMPUDI, skipping...
    No Data Found for KOTHURU, skipping...
    No Data Found for KUNDAVARIKANDRIKA, skipping...
    Data saved: reg