In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException,TimeoutException
import os
import pandas as pd
from time import sleep
from urllib.parse import urlparse, parse_qs
import csv


# Chrome launch flags. The geometry assumes a portrait (vertical) monitor:
# window-size is passed as "1080,1920" and the window is moved to x=-1080,
# presumably onto a display placed to the left of the primary one.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--start-maximized",
    "window-size=1080,1920",
    "window-position=-1080,0",
):
    options.add_argument(chrome_flag)

# Resolve the chromedriver binary through webdriver_manager and wrap it in a
# Service object that every browser session below reuses.
driver_binary = ChromeDriverManager().install()
service = Service(driver_binary)



url = "https://dilrmp.gov.in/faces/rptdistrictwisephysical/rptComputerizationOfLandRecord.xhtml?statecode=9"

# Output location of the district listing.
output_dir = 'output'
filename = "get_district.csv"
output_path_dist = os.path.join(output_dir, filename)

# Stage 1: scrape the state page for its districts, but only when the district
# CSV has not been produced yet (the file doubles as a "done" marker).
if not os.path.exists(output_path_dist):
    driver = webdriver.Chrome(service=service, options=options)
    # try/finally guarantees the browser is closed even if a lookup below raises.
    try:
        driver.get(url)
        # Apply a 67% zoom through JavaScript.
        zoom_level = 0.67
        driver.execute_script(f"document.body.style.zoom = '{zoom_level}'")

        # Look up the footer element, which gets in the way on this page.
        elements = driver.find_elements(By.ID, 'footer')

        if elements:
            elem = elements[0]

            # Remove that element from the DOM with JavaScript.
            driver.execute_script("""
            var element = arguments[0];
            element.parentNode.removeChild(element);
            """, elem)
        else:
            print("El elemento molestoso (footer) no existe")

        # Read the 'statecode' query parameter from the URL the browser is
        # actually on; if the site dropped the query string, fall back to the
        # URL that was requested.
        url_actual = driver.current_url
        parsed_url = urlparse(url_actual)
        params = parse_qs(parsed_url.query)
        if 'statecode' not in params:
            parsed_url = urlparse(url)
            params = parse_qs(parsed_url.query)
        statecode_value = params.get('statecode', [None])[0]

        # Rewrite the third entry of the rows-per-page dropdown to 10000 and
        # select it so that all the data is shown on a single page.
        elem = driver.find_element(By.XPATH, '//*[@id="myform:compListTable_rppDD"]/option[3]')
        driver.execute_script("arguments[0].innerHTML = '10000'", elem)
        driver.execute_script("arguments[0].setAttribute('value', '10000')", elem)
        elem.click()
        sleep(1.35)
        # NOTE(review): the form probably exists before the click already, so
        # this wait may not prove the table was re-rendered; the fixed sleep
        # above is what gives the page time to refresh.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="myform"]'))
        )

        state = driver.find_element(By.XPATH, '//*[@id="myform"]/table/tbody/tr[1]/td/label')
        state_name = state.text

        # One record per district link found in the second column of the table.
        districts = driver.find_elements(By.XPATH, '//*[@id="myform:compListTable_data"]/tr/td[2]/a')
        data = [
            {
                "STATE": state_name,
                "DISTRICT": district.text,
                "URL_DISTRICT": district.get_attribute("href"),
                "STATE_CODE": statecode_value,
            }
            for district in districts
        ]
    finally:
        driver.quit()

    # Fail with a clear message instead of writing nothing useful; an empty
    # file would also stop this stage from ever being retried.
    if not data:
        raise RuntimeError(f"No district links were found at {url}")

    # Build the frame once, after the loop, rather than on every iteration.
    df = pd.DataFrame(data)

    # Create the output directory when it is missing.
    os.makedirs(output_dir, exist_ok=True)

    # Save the CSV file (the 1-based index is not written because index=False).
    df.index = range(1, len(df) + 1)
    df.to_csv(output_path_dist, index=False)
else:
    # Escaped backslash: same printed text, no invalid-escape warning.
    print(f'Ya existe .\\{output_path_dist}')



# Output location of the tehsil listing.
output_dir = "output"
filename_teh = "get_tehsil.csv"
output_path_teh = os.path.join(output_dir, filename_teh)

# Stage 2: create the tehsil CSV with only its header when it does not exist.
if not os.path.exists(output_path_teh):
    print(f"Generating file {filename_teh}")

    # csv.writer plus an explicit UTF-8 encoding keep the file consistent with
    # the rows appended below and with pandas' default decoding.
    with open(output_path_teh, "w", encoding="utf-8", newline="") as file:
        csv.writer(file).writerow(
            ["STATE", "DISTRICT", "TEHSIL", "URL_TEHSIL", "STATE_CODE", "DISTRICT_CODE", "URL_DISTRICT"]
        )

# Read the district CSV produced by the previous stage.
df = pd.read_csv(output_path_dist)

# Load the already-scraped district URLs once instead of re-reading the
# growing output file on every iteration; the set is updated as we go.
df_backup = pd.read_csv(output_path_teh)
scraped_district_urls = set(df_backup["URL_DISTRICT"].dropna())

# Visit every district that is not in the output file yet.
for index, row in df.iterrows():
    state = row["STATE"]
    district = row["DISTRICT"]
    url_district = row["URL_DISTRICT"]
    statecode = row["STATE_CODE"]

    # Skip districts whose URL is already present in the output file.
    if url_district in scraped_district_urls:
        continue

    driver = webdriver.Chrome(service=service, options=options)
    # try/finally guarantees the browser is closed even if a lookup below raises.
    try:
        driver.get(url_district)

        # Take the district code from the 'districtcode' query parameter of
        # the URL the browser is on.
        parsed_url = urlparse(driver.current_url)
        districtcode_value = parse_qs(parsed_url.query).get("districtcode", [None])[0]

        # Remove the "footer" element when it exists.
        elements = driver.find_elements(By.ID, "footer")
        if elements:
            elem = elements[0]
            driver.execute_script(
                """
                var element = arguments[0];
                element.parentNode.removeChild(element);
                """,
                elem,
            )

        # Rewrite the third rows-per-page option to 10000 and select it so
        # the whole table is rendered on one page.
        elem = driver.find_element(By.XPATH, '//*[@id="myform:compListTable_rppDD"]/option[3]')
        driver.execute_script("arguments[0].innerHTML = '10000'", elem)
        driver.execute_script("arguments[0].setAttribute('value', '10000')", elem)
        elem.click()
        sleep(1.25)

        # NOTE(review): the form probably exists before the click already, so
        # this wait may not prove the table was re-rendered.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, '//*[@id="myform"]'))
        )

        # Collect every tehsil row first, so a failure while reading the page
        # cannot leave a half-written district that later runs would skip.
        tehsils = driver.find_elements(By.XPATH, '//*[@id="myform:compListTable_data"]/tr/td[2]/a')
        tehsil_rows = [
            [
                state,
                district,
                tehsil.text,
                tehsil.get_attribute("href"),
                statecode,
                districtcode_value,
                url_district,
            ]
            for tehsil in tehsils
        ]
    finally:
        driver.quit()

    # csv.writer quotes names that contain commas or quotes, which the former
    # hand-built comma-joined line did not; the file is opened once per district.
    with open(output_path_teh, "a", encoding="utf-8", newline="") as file:
        csv.writer(file).writerows(tehsil_rows)

    # A district counts as done only once at least one row was written for it.
    if tehsil_rows:
        scraped_district_urls.add(url_district)

    print(index, state, district, statecode, districtcode_value)






import csv
# Output location of the village listing.
output_dir = "output"
filename_villas = "get_villas.csv"
output_path_villas = os.path.join(output_dir, filename_villas)

# Stage 3: create the village CSV with only its header when it does not exist.
if not os.path.exists(output_path_villas):
    print(f"Generating file {filename_villas}")

    # Explicit UTF-8 keeps the file consistent with pandas' default decoding.
    with open(output_path_villas, "w", encoding="utf-8", newline="") as file:
        csv.writer(file).writerow(
            [
                "STATE",
                "DISTRICT",
                "TEHSIL",
                "VILLAGE",
                "TOTAL_OF_ROR",
                "TOTAL_OF_LAND_OWNER",
                "ROR_DATA_ENTRY_AS_OF_P_S",
                "AVAILABILITY_OF_ROR_DISTRIBUTION_P_S",
                "ROR_LINKAGE_WITH_AA_COMPLETED",
                "ROR_LINKAGE_WITH_AA_ONGOING",
                "OF_LAND_OWNER_HOLDERS_WHOSE_ROR_L_W_A",
                "WHETHER_MUTATION_NOTICE_AND_MUTATION_WORKFLOW_A_C",
                "ISSUANCE_OF_DIGITALLY_SIGNED_ROR",
                "ICT_CHANNEL_OF_DISTRIBUTION_OF_ROR",
                "STATUS_ENTRY_DATE",
                "STATE_CODE",
                "DISTRICT_CODE",
                "TEHSIL_CODE",
                "URL_TEHSIL",
                "URL_DISTRICT",
            ]
        )


# Read the tehsil CSV produced by the previous stage.
df = pd.read_csv(output_path_teh)

# Load the already-scraped tehsil URLs once instead of re-reading the growing
# output file on every iteration; the set is updated as we go.
df_backup = pd.read_csv(output_path_villas)
scraped_tehsil_urls = set(df_backup["URL_TEHSIL"].dropna())

# Visit every tehsil that is not in the output file yet.
for index, row in df.iterrows():
    state = row["STATE"]
    district = row["DISTRICT"]
    tehsil = row["TEHSIL"]
    url_tehsil = row["URL_TEHSIL"]
    url_district = row["URL_DISTRICT"]
    statecode = row["STATE_CODE"]
    districtcode = row["DISTRICT_CODE"]

    # Skip tehsils whose URL is already present in the output file.
    if url_tehsil in scraped_tehsil_urls:
        continue

    driver = webdriver.Chrome(service=service, options=options)
    # try/finally guarantees the browser is closed even if a lookup below raises.
    try:
        driver.get(url_tehsil)

        # Take the tehsil code from the 'tehcode' query parameter of the URL
        # the browser is on.
        parsed_url = urlparse(driver.current_url)
        tehcode_value = parse_qs(parsed_url.query).get("tehcode", [None])[0]

        # Remove the "footer" element when it exists.
        elements = driver.find_elements(By.ID, "footer")
        if elements:
            elem = elements[0]
            driver.execute_script(
                """
                var element = arguments[0];
                element.parentNode.removeChild(element);
                """,
                elem,
            )

        # Rewrite the third rows-per-page option to 10000 and select it so
        # the whole table is rendered on one page.
        elem = driver.find_element(By.XPATH, '//*[@id="j_idt30:compList5_rppDD"]/option[3]')
        driver.execute_script("arguments[0].innerHTML = '10000'", elem)
        driver.execute_script("arguments[0].setAttribute('value', '10000')", elem)
        elem.click()
        sleep(1.15)

        # NOTE(review): this element probably exists before the click already,
        # so the wait may not prove the table was re-rendered.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.ID, "j_idt30"))
        )

        # The data table is taken to be the last <tbody> on the page.
        table_body = driver.find_elements(By.TAG_NAME, "tbody")[-1]
        rows = table_body.find_elements(By.TAG_NAME, "tr")

        # Collect every village row first, so a failure while reading the page
        # cannot leave a half-written tehsil that later runs would skip.
        # `table_row` avoids shadowing the outer DataFrame `row`.
        village_rows = []
        for table_row in rows:
            cells = table_row.find_elements(By.TAG_NAME, "td")
            village_rows.append(
                [
                    state,
                    district,
                    tehsil,
                    *[cell.text for cell in cells[1:]],  # cell text, first cell omitted
                    statecode,
                    districtcode,
                    tehcode_value,
                    url_tehsil,
                    url_district,
                ]
            )
    finally:
        driver.quit()

    # Append the collected rows to the CSV in a single open/close.
    with open(output_path_villas, mode='a', encoding="utf-8", newline='') as file:
        csv.writer(file).writerows(village_rows)

    # A tehsil counts as done only once at least one row was written for it.
    if village_rows:
        scraped_tehsil_urls.add(url_tehsil)

    # Report this row's own district code; the former `districtcode_value` was
    # a leftover global from the tehsil stage (or undefined if that stage
    # skipped every district).
    print(index, state, district, statecode, districtcode)


# Notebook cell expression: echoes `data`, the list of district records.
# `data` is only assigned when the district CSV was generated in this session.
data

Generating file get_tehsil.csv
0 UTTAR PRADESH Agra 9 146
1 UTTAR PRADESH Aligarh 9 143
2 UTTAR PRADESH Ambedkar Nagar 9 178
3 UTTAR PRADESH Amethi 9 664
