In [1]:
%pip install selenium webdriver-manager pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
import time
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from concurrent.futures import ThreadPoolExecutor

# Selenium setup: Chrome options shared by every driver this notebook creates.
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
# Chrome expects this switch as "width,height"; the "1920x1080" spelling is
# not parsed as a size.
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to use a different binary without editing the notebook;
# the original hard-coded location remains the default.
driver_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    "C:\\Users\\yujit\\OneDrive\\Desktop\\chromedriver-win64\\chromedriver.exe",
)

def create_driver():
    """Start and return a Chrome WebDriver.

    The driver is launched through a ``Service`` built from the module-level
    ``driver_path`` and configured with the shared ``chrome_options``.
    """
    return webdriver.Chrome(
        service=Service(driver_path),
        options=chrome_options,
    )

# Districts to scrape: each key is the value selected in the page's "district"
# dropdown, each value is the name printed in log messages.
districts_to_scrape = {"08_2": "ONGOLE"}

# JSON file in which the most recently processed position is recorded.
progress_file = "progress.json"

def save_progress(district, sro, village):
    """Overwrite ``progress_file`` with the given position.

    The file receives a single JSON object with the keys ``district``,
    ``sro`` and ``village``; any previous content is replaced.
    """
    checkpoint = dict(district=district, sro=sro, village=village)
    with open(progress_file, "w") as handle:
        json.dump(checkpoint, handle)

def load_progress():
    """Return the saved progress record, or ``None`` when none is usable.

    Reads the JSON document stored at ``progress_file``.

    Returns:
        The decoded JSON value, or ``None`` if the file does not exist or
        does not contain valid JSON.
    """
    # Open directly instead of checking os.path.exists() first, so the file
    # cannot disappear between the check and the open.
    try:
        with open(progress_file, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
    except json.JSONDecodeError:
        # An empty or truncated file (e.g. a write that was interrupted) is
        # treated as "no progress" rather than crashing the caller.
        print(f"Ignoring unreadable progress file: {progress_file}")
        return None

# Scrape function
_REPORT_URL = "https://registration.ap.gov.in/igrs/reports/Reports/misFormReport"
_REPORT_COLUMNS = [
    "S.No.", "Ward No. - Block No.", "Locality", "Unit Rate", "Ground Floor",
    "First Floor", "Other Floor", "Classification", "Effective Date",
]


def _dropdown_options(dropdown):
    """Return ``{value: visible text}`` for each <option> with a non-empty value."""
    options = {}
    for option in dropdown.find_elements(By.TAG_NAME, "option"):
        value = option.get_attribute("value")
        if value:
            options[value] = option.text
    return options


def _safe_name(name):
    """Replace characters that are not allowed in file names with underscores."""
    return re.sub(r'[\\/:*?"<>|]', "_", name)


def scrape_district(district_value, district_name):
    """Scrape the report table of every village in one district.

    Opens the report form, selects the district, then walks every entry of
    the "sro" dropdown and, for each, every entry of the "village" dropdown.
    For each village the "form1" option is chosen, "Get Details" is clicked
    and, if a ``reportsTable`` element appears, its rows are written to
    ``registration/<sro name>/<village name>.xlsx``. After each village the
    current position is recorded with ``save_progress``.

    Args:
        district_value: Value selected in the "district" dropdown.
        district_name: Name used in log messages.

    Returns:
        None. Any exception is caught and printed, and the browser is always
        closed once it has been started.
    """
    driver = None
    try:
        # Driver creation and navigation are inside the try block so that a
        # failure here is reported and the browser is still shut down.
        driver = create_driver()
        driver.get(_REPORT_URL)
        wait = WebDriverWait(driver, 10)
        print(f"Scraping district: {district_name}")

        select = Select(wait.until(EC.presence_of_element_located((By.NAME, "district"))))
        select.select_by_value(district_value)
        time.sleep(2)  # give the dependent "sro" dropdown time to populate

        sro_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "sro")))
        sro_options = _dropdown_options(sro_dropdown)

        for sro_value, sro_name in sro_options.items():
            # Elements are looked up again on every iteration rather than reused.
            select = Select(wait.until(EC.presence_of_element_located((By.NAME, "sro"))))
            select.select_by_value(sro_value)
            time.sleep(2)

            village_dropdown = wait.until(EC.presence_of_element_located((By.NAME, "village")))
            village_options = _dropdown_options(village_dropdown)

            for village_value, village_name in village_options.items():
                select = Select(wait.until(EC.presence_of_element_located((By.NAME, "village"))))
                select.select_by_value(village_value)
                time.sleep(2)

                select = Select(wait.until(EC.presence_of_element_located((By.NAME, "additionalOption"))))
                select.select_by_value("form1")

                get_details_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button/span[text()='Get Details']")))
                get_details_button.click()
                time.sleep(3)

                try:
                    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, "reportsTable")))
                    table = driver.find_element(By.CLASS_NAME, "reportsTable")
                    rows = table.find_elements(By.TAG_NAME, "tr")
                    # The first row is skipped (presumably the header row - TODO confirm).
                    data = [[cell.text for cell in row.find_elements(By.TAG_NAME, "td")] for row in rows[1:]]

                    if data:
                        df = pd.DataFrame(data, columns=_REPORT_COLUMNS)
                        # Dropdown labels are sanitised so that characters such
                        # as "/" or ":" cannot break or redirect the output path.
                        folder_path = f"registration/{_safe_name(sro_name)}"
                        os.makedirs(folder_path, exist_ok=True)
                        file_path = os.path.join(folder_path, f"{_safe_name(village_name)}.xlsx")
                        df.to_excel(file_path, index=False)
                        print(f"Data saved: {file_path}")

                except TimeoutException:
                    print(f"No Data Found for {village_name}, skipping...")
                    driver.find_element(By.TAG_NAME, "body").click()
                    time.sleep(1)

                save_progress(district_value, sro_value, village_value)

    except Exception as e:
        print(f"Error scraping {district_name}: {str(e)}")
    finally:
        if driver is not None:
            driver.quit()

# Run scraping with multithreading
with ThreadPoolExecutor(max_workers=4) as executor:
    # One Future per district is kept so that an exception raised inside a
    # worker can be reported; an exception stored in a Future that is never
    # inspected would otherwise go unnoticed.
    futures = {
        executor.submit(scrape_district, district_value, district_name): district_name
        for district_value, district_name in districts_to_scrape.items()
    }
    for future, district_name in futures.items():
        try:
            future.result()
        except Exception as e:
            print(f"Error scraping {district_name}: {e}")

print("Scraping completed!")


Scraping district: ONGOLE
Error scraping ONGOLE: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=134.0.6998.89)
Stacktrace:
	GetHandleVerifier [0x00007FF7CBD1FE45+26629]
	(No symbol) [0x00007FF7CBC86010]
	(No symbol) [0x00007FF7CBB1931A]
	(No symbol) [0x00007FF7CBB04F65]
	(No symbol) [0x00007FF7CBB29E04]
	(No symbol) [0x00007FF7CBB9F85F]
	(No symbol) [0x00007FF7CBBBFA55]
	(No symbol) [0x00007FF7CBB97883]
	(No symbol) [0x00007FF7CBB60550]
	(No symbol) [0x00007FF7CBB61803]
	GetHandleVerifier [0x00007FF7CC0772BD+3529853]
	GetHandleVerifier [0x00007FF7CC08DA22+3621858]
	GetHandleVerifier [0x00007FF7CC0824D3+3575443]
	GetHandleVerifier [0x00007FF7CBDEB77A+860474]
	(No symbol) [0x00007FF7CBC9088F]
	(No symbol) [0x00007FF7CBC8CBC4]
	(No symbol) [0x00007FF7CBC8CD66]
	(No symbol) [0x00007FF7CBC7C2C9]
	BaseThreadInitThunk [0x00007FFE3C42259D+29]
	RtlUserThreadStart [0x00007FFE3DC2AF38+40]

